diff --git a/.clang-format b/.clang-format
index 04f2bbaf85b2c..a4de8e7be8e07 100644
--- a/.clang-format
+++ b/.clang-format
@@ -6,11 +6,11 @@
 # The basic usage is,
 #   clang-format -i -style=file PATH/TO/SOURCE/CODE
 #
-# The -style=file implicit use ".clang-format" file located in one of 
-# parent directory. 
+# The -style=file option implicitly uses the ".clang-format" file located in
+# one of the parent directories.
 # The -i means inplace change.
 #
-# The document of clang-format is 
+# The documentation for clang-format is at
 #   http://clang.llvm.org/docs/ClangFormat.html
 #   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
 ---
@@ -20,7 +20,7 @@ IndentWidth:     2
 TabWidth:        2
 ContinuationIndentWidth: 4
 AccessModifierOffset: -1  # The private/protected/public has no indent in class
-Standard:  Cpp11 
+Standard:  Cpp11
 AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 1fcb3dc4f521d..7b62f131b9587 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -53,7 +53,6 @@ python/paddle/base/compiler.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84
 python/paddle/base/dygraph/layers.py @JiabinYang @phlrain
 python/paddle/base/framework.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84
 python/paddle/base/__init__.py @phlrain @Aurelius84 @qili93
-python/paddle/base/parallel_executor.py @Xreki @zhhsplendid @Aurelius84
 python/paddle/base/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py @Aurelius84 @phlrain
 python/paddle/base/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py @Aurelius84 @phlrain
 python/paddle/base/tests/unittests/white_list/check_shape_white_list.py @hong19860320 @Aurelius84 @phlrain
diff --git a/.gitignore b/.gitignore
index 12abbf0f03caa..667ca443fe77e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -106,3 +106,7 @@ paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/*
 paddle/fluid/pybind/static_op_function.*
 paddle/fluid/pybind/ops_api.cc
 python/paddle/tensor/tensor.pyi
+paddle/phi/kernels/fusion/cutlass/conv2d/build
+paddle/phi/kernels/fusion/cutlass/conv2d/cutlass
+paddle/phi/kernels/fusion/cutlass/gemm_epilogue/build
+paddle/phi/kernels/fusion/cutlass/gemm_epilogue/cutlass
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3ea3a927cf7bc..9e56241f69176 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,6 @@ repos:
     -   id: sort-simple-yaml
         files: (ops|backward|op_[a-z_]+)\.yaml$
     -   id: trailing-whitespace
-        files: (.*\.(py|bzl|md|rst|c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps|cmake|yaml|yml|hook)|BUILD|.*\.BUILD|WORKSPACE|CMakeLists\.txt)$
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
     rev: v1.5.1
     hooks:
@@ -55,7 +54,6 @@ repos:
     rev: 23.3.0
     hooks:
     -   id: black
-        files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.3.5
     hooks:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0aa41a26d700e..f0b2fa79d362a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -99,6 +99,9 @@ if(WITH_GPU AND WITH_ROCM)
 endif()
 
 if(WITH_GPU AND NOT APPLE)
+  if(WITH_PIP_CUDA_LIBRARIES AND CMAKE_SYSTEM_NAME STREQUAL "Windows")
+    add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES)
+  endif()
   #(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS
   if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
                                             "x86_64")
@@ -107,8 +110,8 @@ if(WITH_GPU AND NOT APPLE)
         CACHE BOOL "" FORCE)
     set(CMAKE_CUDA_FLAGS "--cudart shared")
     if(WITH_PIP_CUDA_LIBRARIES)
-      #(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA.
-      add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
+      #(Note risemeup1): Flag 'PADDLE_WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA.
+      add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES)
     endif()
   endif()
   enable_language(CUDA)
diff --git a/cmake/PaddleConfig.cmake.in b/cmake/PaddleConfig.cmake.in
index d32c23f6f6edd..e55038bb77c63 100644
--- a/cmake/PaddleConfig.cmake.in
+++ b/cmake/PaddleConfig.cmake.in
@@ -12,7 +12,7 @@
 get_filename_component(PADDLE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_FILE}/../.." ABSOLUTE)
 
 # include directories
-set(PADDLE_INCLUDE_DIRS 
+set(PADDLE_INCLUDE_DIRS
     ${PADDLE_INSTALL_PREFIX}/include
     ${PADDLE_INSTALL_PREFIX}/include/third_party
 )
diff --git a/cmake/cinn/external/absl.cmake b/cmake/cinn/external/absl.cmake
index 8d9e0e45b45ba..46859d7caa871 100644
--- a/cmake/cinn/external/absl.cmake
+++ b/cmake/cinn/external/absl.cmake
@@ -63,6 +63,10 @@ set(ABSL_LIB_NAMES
     raw_hash_set)
 set(ABSL_LIBS "")
 
+if(WITH_ROCM)
+  list(APPEND ABSL_LIB_NAMES strings_internal raw_logging_internal)
+endif()
+
 add_library(absl STATIC IMPORTED GLOBAL)
 set_property(TARGET absl PROPERTY IMPORTED_LOCATION
                                   ${ABSL_INSTALL_DIR}/lib/libabsl_base.a)
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 6efed5b468576..9c43b15b28a63 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -136,11 +136,7 @@ list(APPEND HIP_CXX_FLAGS -Wno-unused-local-typedef)
 list(APPEND HIP_CXX_FLAGS -Wno-missing-braces)
 list(APPEND HIP_CXX_FLAGS -Wno-sometimes-uninitialized)
 
-if(WITH_CINN)
-  list(APPEND HIP_CXX_FLAGS -std=c++14)
-else()
-  list(APPEND HIP_CXX_FLAGS -std=c++17)
-endif()
+list(APPEND HIP_CXX_FLAGS -std=c++17)
 list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)
 
 if(CMAKE_BUILD_TYPE MATCHES Debug)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c617f6e56c994..2d4528fa3316f 100755
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -285,6 +285,14 @@ else()
       inference_lib_dist
       SRCS ${paddle_phi_lib}
       DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+    if(WITH_GPU OR WITH_ROCM)
+      set(paddle_phi_kernel_gpu_lib
+          ${PADDLE_BINARY_DIR}/paddle/phi/libphi_kernel_gpu.*)
+      copy(
+        inference_lib_dist
+        SRCS ${paddle_phi_kernel_gpu_lib}
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+    endif()
   endif()
 endif()
 
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
index ad8ee179d60c2..e80900da58777 100644
--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
@@ -24,7 +24,7 @@
     "const unsigned char "
     + var
     + "[] = {"
-    + ",".join(["0x%02x" % ord(c) for c in open(res).read()])
+    + ",".join([f"0x{ord(c):02x}" for c in open(res).read()])
     + ",0};\n"
     + "const unsigned "
     + var
diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh
index 8fd58925ee482..67e6304bf3d2e 100755
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -14,11 +14,11 @@
 # limitations under the License.
 
 #
-# A simple test driver for cmake. 
+# A simple test driver for cmake.
 # set PYTHONPATH before run command.
 # Usage:
 #    ./.set_python_pash.sh -p YOUR_PYTHON_PATH {exec...}
-# 
+#
 # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
 #
 PYPATH=""
diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc
index 42986fff0dbb1..54805f2c78f50 100644
--- a/paddle/cinn/ast_gen_ius/ast_gen.cc
+++ b/paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -131,6 +131,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
       } else {
         iter_values.push_back(axis_vars[i]);
       }
+      ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
     }
     VLOG(4) << "iter_value.size() and block_vars.size() is "
             << iter_values.size() << " " << block_vars.size();
@@ -167,6 +168,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
       } else {
         reduce_iter_values.push_back(axis_vars[i]);
       }
+      ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
     }
     VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body;
     for (int i = 0; i < reduce_axis.size(); ++i) {
@@ -227,6 +229,9 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
         ir::ScheduleBlock::Make(
             reduce_block_vars, {}, {}, tensor->name, reduce_body));
     for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
+      ir::TryElevateInt32ToInt64({reduce_axis[i],
+                                  reduce_axis[i]->lower_bound,
+                                  reduce_axis[i]->upper_bound});
       reduce_body = ir::For::Make(reduce_axis[i],
                                   reduce_axis[i]->lower_bound,
                                   reduce_axis[i]->upper_bound,
diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index 6f00ee34813d1..c51ba89806956 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -32,7 +32,7 @@
 #include "paddle/cinn/lang/lower.h"
 #include "paddle/cinn/optim/optimize.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -193,10 +193,14 @@ ir::LoweredFunc UpdateFuncWithNewBody(const cinn::common::Target& target,
 std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
   const ir::ScheduleBlockRealize* block_realize =
       block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
   const ir::ScheduleBlock* block_node =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlock"));
   std::vector<ir::Expr> iter_values = block_realize->iter_values;
   std::vector<ir::Var> iter_vars = block_node->iter_vars;
 
@@ -218,10 +222,14 @@ std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
 std::string GetBlockName(const ir::Expr block) {
   const ir::ScheduleBlockRealize* block_realize =
       block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
   const ir::ScheduleBlock* block_node =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlock"));
   return block_node->name;
 }
 
diff --git a/paddle/cinn/auto_schedule/auto_tuner.cc b/paddle/cinn/auto_schedule/auto_tuner.cc
index d45dcc743e525..9524e1ed3048f 100644
--- a/paddle/cinn/auto_schedule/auto_tuner.cc
+++ b/paddle/cinn/auto_schedule/auto_tuner.cc
@@ -34,7 +34,7 @@
 #include "paddle/cinn/hlir/framework/op.h"
 #include "paddle/cinn/hlir/framework/visualize_helper.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -144,9 +144,12 @@ void PrintResult(const TuningResult& result) {
 }
 
 TuningResult AutoTuner::Tune(const TuningOptions& options) {
-  CHECK_GT(options.num_tuning_rounds, 0) << "Invalid config";
+  PADDLE_ENFORCE_GT(options.num_tuning_rounds,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The num_tuning_rounds should be greater than 0."));
   VLOG(3) << "Begin tuning with round num=" << options.num_tuning_rounds
           << ", tasks size=" << tasks_.size();
 
   TuningResult result;
   result.subgraphs.resize(tasks_.size());
diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc
index a9074c76fa8cf..54396ecaa6e2e 100644
--- a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc
+++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc
@@ -24,7 +24,7 @@
 #include "paddle/cinn/auto_schedule/search_space/search_state.h"
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -45,8 +45,10 @@ void ExprCostModel::Train(const std::vector<const ir::ModuleExpr*>& samples,
                           const cinn::common::Target& target) {
   trained_times_.store(1);
   size_t total_size = samples.size();
-  CHECK_EQ(total_size, labels.size())
-      << "Samples must have same size as labels";
+  PADDLE_ENFORCE_EQ(
+      total_size,
+      labels.size(),
+      phi::errors::InvalidArgument("Samples must have same size as labels"));
   std::vector<std::vector<float>> train_feature_numbers(total_size);
   FeatureExtractor extractor;
   for (size_t i = 0; i < total_size; ++i) {
@@ -63,8 +65,10 @@ void ExprCostModel::Update(const std::vector<const ir::ModuleExpr*>& samples,
                            const cinn::common::Target& target) {
   ++trained_times_;
   size_t total_size = samples.size();
-  CHECK_EQ(total_size, labels.size())
-      << "Samples must have same size as labels";
+  PADDLE_ENFORCE_EQ(
+      total_size,
+      labels.size(),
+      phi::errors::InvalidArgument("Samples must have same size as labels"));
   std::vector<std::vector<float>> train_feature_numbers(total_size);
   FeatureExtractor extractor;
   for (size_t i = 0; i < total_size; ++i) {
diff --git a/paddle/cinn/auto_schedule/database/database.cc b/paddle/cinn/auto_schedule/database/database.cc
index 2036b44a83fef..ee8277b9dadd6 100644
--- a/paddle/cinn/auto_schedule/database/database.cc
+++ b/paddle/cinn/auto_schedule/database/database.cc
@@ -22,7 +22,7 @@
 #include "paddle/cinn/auto_schedule/task/task_registry.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/schedule/schedule_desc.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -42,8 +42,10 @@ proto::TuningRecord TuningRecord::ToProto() const {
 
 Database::Database(int capacity_per_task)
     : capacity_per_task_(capacity_per_task) {
-  CHECK_GT(capacity_per_task_, 0)
-      << "capacity_per_task_ should be greater than 0";
+  PADDLE_ENFORCE_GT(capacity_per_task_,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "capacity_per_task_ should be greater than 0"));
 }
 
 std::unique_ptr<Database> Database::Make(const DatabaseConfig& config) {
diff --git a/paddle/cinn/auto_schedule/measure/simple_builder.cc b/paddle/cinn/auto_schedule/measure/simple_builder.cc
index 5be5b8528616f..0636cfc2b79fa 100644
--- a/paddle/cinn/auto_schedule/measure/simple_builder.cc
+++ b/paddle/cinn/auto_schedule/measure/simple_builder.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/cinn/auto_schedule/measure/simple_builder.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -25,8 +25,10 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
     : graph_compiler_(graph_compiler) {}
 
 BuildResult SimpleBuilder::Build(const MeasureInput& input) {
-  CHECK_NE(graph_compiler_, static_cast<GraphCompiler*>(nullptr))
-      << "empty handle to GraphCompiler";
+  PADDLE_ENFORCE_NE(
+      graph_compiler_,
+      static_cast<GraphCompiler*>(nullptr),
+      phi::errors::InvalidArgument("empty handle to GraphCompiler"));
   CompilationContext& context = graph_compiler_->GetCompilationContext();
   context.groups.emplace_back(input.task->subgraph);
   context.lowered_funcs.emplace_back(input.lowered_funcs);
diff --git a/paddle/cinn/auto_schedule/measure/simple_runner.cc b/paddle/cinn/auto_schedule/measure/simple_runner.cc
index 92dcc00693b5b..ec3929aff71ae 100644
--- a/paddle/cinn/auto_schedule/measure/simple_runner.cc
+++ b/paddle/cinn/auto_schedule/measure/simple_runner.cc
@@ -25,7 +25,7 @@
 #include "paddle/cinn/hlir/framework/buffer.h"
 #include "paddle/cinn/hlir/framework/scope.h"
 #include "paddle/cinn/hlir/framework/tensor.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -76,8 +76,11 @@ static void PopulateRandomValue(const cinn::common::Type& type,
     std::generate_n(
         fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
   } else {
-    CHECK_EQ(type.bytes(), 8)
-        << "Unsupported type: " << type << ", type.bytes = " << type.bytes();
+    PADDLE_ENFORCE_EQ(
+        type.bytes(),
+        8,
+        phi::errors::Unimplemented("Unsupported type, the type.bytes is %d",
+                                   type.bytes()));
     auto* fmt_ptr = reinterpret_cast<uint8_t*>(raw_ptr);
     std::uniform_int_distribution<uint8_t> dist(
         std::numeric_limits<uint8_t>::min(),
@@ -127,7 +130,12 @@ static std::unordered_set<std::string> ParamsNeedInitWithZero(
       std::vector<int> param_idxs = kInitWithZeroParams.at(node->op()->name);
       const auto& inlinks = node->inlinks_in_order();
       for (int param_idx : param_idxs) {
-        CHECK_GT(inlinks.size(), param_idx);
+        PADDLE_ENFORCE_GT(inlinks.size(),
+                          param_idx,
+                          phi::errors::InvalidArgument(
+                              "The input size of the node is less than the "
+                              "index of the parameter that needs to be "
+                              "initialized to 0"));
         auto& edge = inlinks.at(param_idx);
         std::string param_name =
             edge->source()->as<hlir::framework::NodeData>()->id();
@@ -141,7 +149,10 @@ static std::unordered_set<std::string> ParamsNeedInitWithZero(
 }
 
 SimpleRunner::SimpleRunner(int repeat_times) : repeat_times_(repeat_times) {
-  CHECK_GT(repeat_times_, 0) << "repeat_times can't less than 0";
+  PADDLE_ENFORCE_GT(
+      repeat_times_,
+      0,
+      phi::errors::InvalidArgument("repeat_times should be greater than 0"));
 }
 
 // Prepare execution arguments of all instructions to run, a argument
diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc
index 2e3c4b0e21661..ffc8a0f21d903 100644
--- a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc
+++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc
@@ -18,7 +18,7 @@
 #include "paddle/cinn/ir/ir_printer.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/schedule/schedule_desc.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -29,7 +29,10 @@ int ExtractNumThreads(const ir::IRSchedule& ir_schedule,
     if (step.type == "Bind" &&
         step.attrs.find("thread_axis") != step.attrs.end() &&
         absl::get<std::string>(step.attrs.at("thread_axis")) == bind_axis) {
-      CHECK_EQ(step.inputs.at("loop").size(), 1);
+      PADDLE_ENFORCE_EQ(step.inputs.at("loop").size(),
+                        1,
+                        phi::errors::InvalidArgument(
+                            "The loop size of bind step should be 1"));
       return step.inputs.at("loop")[0].As<ir::For>()->extent.as_int32();
     }
   }
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
index e59ba8b423293..523763942c64e 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
@@ -21,7 +21,7 @@
 #include "paddle/cinn/ir/schedule_block_graph.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -40,8 +40,11 @@ bool IsSpatialLoop(const ir::For* for_node) {
         const auto* schedule_block =
             block_realize->schedule_block.As<ir::ScheduleBlock>();
         CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock";
-        CHECK_EQ(block_realize->iter_values.size(),
-                 schedule_block->iter_vars.size());
+        PADDLE_ENFORCE_EQ(
+            block_realize->iter_values.size(),
+            schedule_block->iter_vars.size(),
+            phi::errors::InvalidArgument(
+                "The size of iter_values and iter_vars should be equal."));
         for (int i = 0; i < block_realize->iter_values.size(); ++i) {
           const ir::Var& iter_var = schedule_block->iter_vars[i];
           const ir::Expr& binding = block_realize->iter_values[i];
@@ -93,10 +96,16 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
                   int max_blocks,
                   int max_threads_per_block) {
   auto all_loops = ir_schedule->GetLoops(block_name);
-  CHECK_LE(num_loops_to_bind, all_loops.size())
-      << "The number of loops to be bind is greater than size of all_loops";
-  CHECK_GE(num_loops_to_bind, 0)
-      << "The number of loops to be bind should be greater than 0";
+  PADDLE_ENFORCE_LE(
+      num_loops_to_bind,
+      all_loops.size(),
+      phi::errors::InvalidArgument(
+          "The number of loops to be bind is greater than size of all_loops"));
+  PADDLE_ENFORCE_GE(
+      num_loops_to_bind,
+      0,
+      phi::errors::InvalidArgument(
+          "The number of loops to bind should be greater than or equal to 0"));
   // check whether it is the case that threadIdx has been binded but blockIdx
   // not, the threadIdx can only be binded in the first loop after
   // num_loops_to_bind loops because we has excluded other cases in
@@ -130,13 +139,19 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
 
   if (extent <= max_blocks * max_threads_per_block) {
     auto splits = ir_schedule->Split(fused_loop, {-1, max_threads_per_block});
-    CHECK_EQ(splits.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        splits.size(),
+        2,
+        phi::errors::InvalidArgument("The size of splits should be 2."));
     ir_schedule->Bind(splits[0], "blockIdx.x");
     ir_schedule->Bind(splits[1], "threadIdx.x");
   } else {
     auto splits =
         ir_schedule->Split(fused_loop, {-1, max_blocks, max_threads_per_block});
-    CHECK_EQ(splits.size(), 3);
+    PADDLE_ENFORCE_EQ(
+        splits.size(),
+        3,
+        phi::errors::InvalidArgument("The size of splits should be 3."));
     ir_schedule->Reorder({splits[1], splits[2], splits[0]});
     all_loops = ir_schedule->GetLoops(block_name);
     ir_schedule->Bind(all_loops[0], "blockIdx.x");
@@ -160,8 +175,11 @@ RuleApplyType AutoBind::Init(ir::IRSchedule* ir_schedule) {
 }
 
 void AutoBind::Apply(int index) {
-  CHECK_LT(index, applicable_schedule_blocks_.size())
-      << "invalid apply index:" << index;
+  PADDLE_ENFORCE_LT(
+      index,
+      applicable_schedule_blocks_.size(),
+      phi::errors::InvalidArgument(
+          "The index should be less than size of applicable_schedule_blocks_"));
   auto applied_block = applicable_schedule_blocks_.at(index);
   auto all_loops = ir_schedule_->GetLoops(applied_block);
   BindGPUIndex(ir_schedule_,
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc
index e52d91c125224..ef0dbef492a59 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc
@@ -20,7 +20,7 @@
 
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -28,16 +28,22 @@ AutoGenRule::AutoGenRule(const cinn::common::Target& target)
     : target_(&target) {}
 
 int AutoGenRule::NumberApplicable() const {
-  CHECK_GE(num_applicable_, 0)
-      << "Call " << GetRuleName()
-      << "::NumberApplicable() without initialization.";
+  PADDLE_ENFORCE_GE(
+      num_applicable_,
+      0,
+      phi::errors::InvalidArgument(
+          "Call %s::NumberApplicable() without initialization.",
+          GetRuleName()));
   return num_applicable_;
 }
 
 void AutoGenRule::ApplyRandomly() {
-  CHECK_GT(num_applicable_, 0)
-      << "Call " << GetRuleName()
-      << "::ApplyRandomly() with NumberApplicable() == 0";
+  PADDLE_ENFORCE_GT(
+      num_applicable_,
+      0,
+      phi::errors::InvalidArgument(
+          "Call %s::ApplyRandomly() with NumberApplicable() == 0",
+          GetRuleName()));
   int index = rand() % num_applicable_;  // NOLINT
   return Apply(index);
 }
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
index c052d2995c8ad..a4ecd5036e2e7 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
@@ -22,7 +22,7 @@
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -97,8 +97,9 @@ RuleApplyType AutoUnroll::Init(ir::IRSchedule* ir_schedule) {
 }
 
 void AutoUnroll::Apply(int index) {
-  CHECK_LT(index, applicable_schedule_blocks_.size())
-      << "invalid apply index:" << index;
+  PADDLE_ENFORCE_LT(index,
+                    applicable_schedule_blocks_.size(),
+                    phi::errors::InvalidArgument("Index is out of range."));
   auto applied_block = applicable_schedule_blocks_.at(index);
   int max_step = auto_unroll_options[std::rand() % auto_unroll_options.size()];
   ir_schedule_->Annotate(
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
index 1bbc8da4497d6..759dbfa54d3a4 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
@@ -27,7 +27,7 @@
 #include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -103,8 +103,11 @@ class MultiLevelTiling : public AutoGenRule {
   // Sample num_split integers whose product equals extent
   template <typename T>
   std::vector<T> SampleTileSplit(T extent, int num_split) const {
-    CHECK_GT(num_split, 0)
-        << "num_split in SampleTileSplit must be greater than 0";
+    PADDLE_ENFORCE_GT(
+        num_split,
+        0,
+        phi::errors::InvalidArgument(
+            "num_split in SampleTileSplit must be greater than 0"));
     if (num_split == 1) {
       return {extent};
     }
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc
index 85bc207c84fc7..0053c87a81394 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc
@@ -23,7 +23,7 @@
 #include "paddle/cinn/ir/tensor.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -32,10 +32,16 @@ bool ReductionFactoring::CanApply(const std::string& block_name,
   ir::Expr block_expr = ir_schedule->GetBlock(block_name);
   ir::ScheduleBlockRealize* block_realize =
       block_expr.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument(
+          "The block_expr should be a ScheduleBlockRealize."));
   ir::ScheduleBlock* sch_block =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(sch_block);
+  PADDLE_ENFORCE_NOT_NULL(
+      sch_block,
+      phi::errors::InvalidArgument(
+          "The schedule_block field is not a ScheduleBlock."));
   AnalyzeScheduleBlockReadWriteBuffer(sch_block);
 
   // 1. The block must have write buffer
@@ -135,7 +141,11 @@ void ReductionFactoring::Apply(const std::string& block_name,
     return;
   }
   // 3. Reorder if new_loop_order differs from the original order
-  CHECK_EQ(all_loops.size(), new_loop_order.size());
+  PADDLE_ENFORCE_EQ(
+      all_loops.size(),
+      new_loop_order.size(),
+      phi::errors::InvalidArgument("The size of all_loops should be equal to "
+                                   "the size of new_loop_order."));
   for (int i = 0; i < all_loops.size(); ++i) {
     if (all_loops[i].As<ir::For>()->loop_var->name !=
         new_loop_order[i].As<ir::For>()->loop_var->name) {
@@ -152,7 +162,11 @@ void ReductionFactoring::Apply(const std::string& block_name,
     for (int i = num_spatial_loops; i < all_loops.size(); ++i) {
       reduction_loop_indices.push_back(i);
     }
-    CHECK_EQ(reduction_loop_indices.size(), num_reduction_loops);
+    PADDLE_ENFORCE_EQ(reduction_loop_indices.size(),
+                      num_reduction_loops,
+                      phi::errors::InvalidArgument(
+                          "The size of reduction_loop_indices should be equal "
+                          "to num_reduction_loops."));
     fused_reduce_loop = ir_schedule->Fuse(block_name, reduction_loop_indices);
   } else {
     all_loops = ir_schedule->GetLoops(block_name);
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc
index d56d97f83df60..fb327c130dbbf 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc
@@ -23,8 +23,8 @@
 
 #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h"
 #include "paddle/cinn/ir/ir_printer.h"
+#include "paddle/common/enforce.h"
 #include "test/cpp/cinn/concrete_program_builder.h"
-
 PD_DECLARE_bool(cinn_new_group_scheduler);
 
 namespace cinn {
@@ -64,8 +64,13 @@ class TestReductionFactoring : public TestAutoGenRuleBase {
 
     // check
     const std::vector<ir::Expr>& blocks = ir_schedule.GetAllBlocks();
-    CHECK_EQ(blocks.size(), 2UL);
-    CHECK_EQ(ir.str(), expected_ir);
+    PADDLE_ENFORCE_EQ(
+        blocks.size(),
+        2UL,
+        phi::errors::InvalidArgument("The size of blocks should be 2."));
+    PADDLE_ENFORCE_EQ(ir.str(),
+                      expected_ir,
+                      phi::errors::InvalidArgument("The ir is not correct."));
   }
 };
 
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
index 994027dba0ee4..66d25c65542d1 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
@@ -18,7 +18,6 @@
 #include <gtest/gtest.h>
 #include <memory.h>
 #include <stdlib.h>
-
 #include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/cinn.h"
@@ -29,6 +28,7 @@
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/framework/tensor.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
+#include "paddle/common/enforce.h"
 #ifdef CINN_WITH_CUDA
 #include <cuda_runtime.h>
 #endif
@@ -89,8 +89,10 @@ std::string TestAutoGenRuleBase::GetIR(const ir::IRSchedule& schedule) {
 
 ir::Module TestAutoGenRuleBase::BuildIRModule(const ir::IRSchedule& schedule) {
   auto&& updated_bodys = schedule.GetModule().GetExprs();
-  CHECK_EQ(lowered_funcs_.size(), updated_bodys.size())
-      << "associated exprs size not equal";
+  PADDLE_ENFORCE_EQ(
+      lowered_funcs_.size(),
+      updated_bodys.size(),
+      phi::errors::InvalidArgument("Associated exprs size not equal"));
 
   ir::Module::Builder builder("test_builder", this->target_);
   for (int i = 0; i < lowered_funcs_.size(); ++i) {
@@ -175,10 +177,16 @@ void CheckResult(raw_func_type test_func,
                  const cinn::common::Target& target) {
   CHECK(input_names.size()) << "The number of inputs must be greater than 0.";
   CHECK(output_names.size()) << "The number of outputs must be greater than 0.";
-  CHECK_EQ(input_names.size(), input_shapes.size())
-      << "The quantity of input_names and input_shapes must be equal.";
-  CHECK_EQ(output_names.size(), output_shapes.size())
-      << "The quantity of output_names and output_shapes must be equal.";
+  PADDLE_ENFORCE_EQ(
+      input_names.size(),
+      input_shapes.size(),
+      phi::errors::InvalidArgument(
+          "The quantity of input_names and input_shapes must be equal."));
+  PADDLE_ENFORCE_EQ(
+      output_names.size(),
+      output_shapes.size(),
+      phi::errors::InvalidArgument(
+          "The quantity of output_names and output_shapes must be equal."));
 
   // Initialize data
   std::vector<float*> input_data_ptrs(input_names.size());
diff --git a/paddle/cinn/auto_schedule/search_space/block_sampler.cc b/paddle/cinn/auto_schedule/search_space/block_sampler.cc
index 93de31e6a5e36..38d3b7badd02a 100644
--- a/paddle/cinn/auto_schedule/search_space/block_sampler.cc
+++ b/paddle/cinn/auto_schedule/search_space/block_sampler.cc
@@ -17,7 +17,7 @@
 #include <algorithm>
 
 #include "paddle/cinn/ir/ir.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -27,7 +27,10 @@ std::unique_ptr<BlockSampler> BlockSampler::Make(
     const std::string& strategy,
     utils::LinearRandomEngine::StateType rand_seed,
     const std::vector<int>& weights) {
-  CHECK_GT(all_blocks.size(), 0) << "Empty block list";
+  PADDLE_ENFORCE_GT(
+      all_blocks.size(),
+      0,
+      phi::errors::InvalidArgument("The all_blocks should not be empty."));
   if (strategy == "traversal") {
     VLOG(6) << "Init TraversalBlockSampler with block num = "
             << all_blocks.size();
@@ -87,7 +90,11 @@ ProbabilisticBlockSampler::ProbabilisticBlockSampler(
   if (weights.empty()) {
     weights_.resize(all_blocks.size(), 1);
   } else {
-    CHECK_EQ(all_blocks.size(), weights_.size());
+    PADDLE_ENFORCE_EQ(
+        all_blocks.size(),
+        weights_.size(),
+        phi::errors::InvalidArgument(
+            "The size of all_blocks and weights should be equal."));
   }
   remains_ = all_blocks.size();
 }
diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
index 3c0868d0748e5..bd8e818546a91 100644
--- a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
+++ b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
@@ -16,7 +16,7 @@
 
 #include <algorithm>
 #include <random>
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -26,7 +26,10 @@ std::unique_ptr<RuleSampler> RuleSampler::Make(
     const std::string& strategy,
     utils::LinearRandomEngine::StateType rand_seed,
     const std::vector<int>& weights) {
-  CHECK_GT(potential_rules.size(), 0) << "Empty rule list";
+  PADDLE_ENFORCE_GT(
+      potential_rules.size(),
+      0,
+      phi::errors::InvalidArgument("The potential_rules should not be empty."));
   if (strategy == "traversal") {
     return std::make_unique<TraversalRuleSampler>(potential_rules,
                                                   default_remove_policy);
@@ -64,7 +67,11 @@ ProbabilisticRuleSampler::ProbabilisticRuleSampler(
   if (weights.empty()) {
     weights_.resize(potential_rules.size(), 1);
   } else {
-    CHECK_EQ(potential_rules.size(), weights_.size());
+    PADDLE_ENFORCE_EQ(
+        potential_rules.size(),
+        weights_.size(),
+        phi::errors::InvalidArgument(
+            "The size of potential_rules and weights should be equal."));
   }
   remains_ = potential_rules.size();
 }
diff --git a/paddle/cinn/auto_schedule/search_space/search_space.cc b/paddle/cinn/auto_schedule/search_space/search_space.cc
index 650e1d572f831..a4f4db6472e1b 100644
--- a/paddle/cinn/auto_schedule/search_space/search_space.cc
+++ b/paddle/cinn/auto_schedule/search_space/search_space.cc
@@ -33,7 +33,7 @@
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(auto_schedule_use_cost_model);
 
 namespace cinn {
@@ -109,7 +109,10 @@ SearchState SearchSpace::RandomScheduleMutate(const SearchState& state) {
   --iter;
 
   int sample_rule_index = iter->second;
-  CHECK_LT(sample_rule_index, ret->applicable_rules.size());
+  PADDLE_ENFORCE_LT(sample_rule_index,
+                    ret->applicable_rules.size(),
+                    phi::errors::InvalidArgument(
+                        "sample_rule_index is out of applicable_rules range."));
   AutoGenRule* sample_rule = ret->applicable_rules.at(sample_rule_index);
   VLOG(7) << "Apply rule: " << sample_rule->GetRuleName()
           << " with index=" << sample_weighted_index - iter->first;
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
index dcb6e1ca93914..6403283f18be1 100644
--- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
@@ -35,7 +35,7 @@
 #include "paddle/cinn/utils/multi_threading.h"
 #include "paddle/cinn/utils/sized_multi_set.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(auto_schedule_use_cost_model);
 
 namespace cinn {
@@ -175,9 +175,11 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1,
   std::vector<ir::Expr> mother_exprs =
       state2->ir_schedule.GetModule().GetExprs();
 
-  CHECK_EQ(father_exprs.size(), mother_exprs.size())
-      << "CrossOver ModuleExpr in EvolutionarySearch must have same number of "
-         "AST";
+  PADDLE_ENFORCE_EQ(father_exprs.size(),
+                    mother_exprs.size(),
+                    phi::errors::InvalidArgument(
+                        "CrossOver ModuleExpr in EvolutionarySearch must have "
+                        "same number of AST"));
 
   for (size_t i = 0; i < father_exprs.size(); ++i) {
     if (utils::SampleUniformInt(0, 2, &rand_seed_) == 0) {
@@ -200,10 +202,15 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1,
 
 SearchState EvolutionarySearch::Mutate(
     const SearchState& state, utils::LinearRandomEngine::StateType* rand_seed) {
-  CHECK_GT(weighted_mutators_.size(), 0)
-      << "There is no mutate rule can be applied.";
+  PADDLE_ENFORCE_GT(
+      weighted_mutators_.size(),
+      0,
+      phi::errors::InvalidArgument("No mutate rule can be applied."));
   double accu_weight = (weighted_mutators_.rbegin())->first;
-  CHECK_GT(accu_weight, 0) << "The accumulate weight must be greater than 0.";
+  PADDLE_ENFORCE_GT(accu_weight,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The accumulate weight must be greater than 0."));
   // sample a mutate rule
   double sample_weight = utils::SampleUniformDouble(0, accu_weight, rand_seed);
   auto sampled_iter = weighted_mutators_.upper_bound(sample_weight);
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc
index 6a983d7f9aaac..7791cdf9f89d5 100644
--- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc
@@ -30,8 +30,8 @@
 #include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/common/enforce.h"
 #include "test/cpp/cinn/program_builder.h"
-
 namespace cinn {
 namespace auto_schedule {
 
@@ -159,7 +159,10 @@ TEST(EvolutionarySearch, Evolve) {
   auto tasks = CreateTasks(
       tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}),
       target);
-  CHECK_EQ(tasks.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      tasks.size(),
+      1,
+      phi::errors::InvalidArgument("The size of tasks should be 1."));
   ExprCostModel cost_model;
   std::vector<const ir::ModuleExpr*> cost_model_samples(1);
   std::vector<float> cost_model_labels(1);
@@ -206,7 +209,11 @@ TEST(EvolutionarySearch, Evolve) {
       VLOG(6) << "cost = " << s->predicted_cost;
     }
     VLOG(6) << "total_cost_next = " << total_cost_next;
-    CHECK_LE(total_cost_next, total_cost_pre);
+    PADDLE_ENFORCE_LE(
+        total_cost_next,
+        total_cost_pre,
+        phi::errors::InvalidArgument("The total cost should be less than or "
+                                     "equal to the previous one."));
     std::swap(population_pre_ptr, population_next_ptr);
   }
 }
diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc
index 273cba4c4060e..a027dc9dd1ed5 100644
--- a/paddle/cinn/auto_schedule/task/task_optimizer.cc
+++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc
@@ -18,7 +18,6 @@
 
 #include <functional>
 #include <limits>
-
 #include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
 #include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
 #include "paddle/cinn/auto_schedule/measure/measure.h"
@@ -34,6 +33,7 @@
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/string.h"
+#include "paddle/common/enforce.h"
 #ifdef CINN_WITH_CUDA
 #include <cuda_runtime_api.h>
 
@@ -223,9 +223,12 @@ bool IsWrappedByCustomCall(const TuneTask* task) {
 
 TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
     const TuningOptions& options) {
-  CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0)
-      << "TuningOptions.num_measure_trials % "
-         "TuningOptions.num_samples_per_iteration must be 0.";
+  PADDLE_ENFORCE_EQ(
+      options.num_measure_trials % options.num_samples_per_iteration,
+      0,
+      phi::errors::InvalidArgument(
+          "TuningOptions.num_measure_trials % "
+          "TuningOptions.num_samples_per_iteration must be 0."));
 
   VLOG(4) << "Optimizing TuneTask with num_measure_trials:"
           << options.num_measure_trials
@@ -290,9 +293,11 @@ TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
             << measure_inputs.size();
     std::vector<MeasureResult> measure_outputs =
         schedule_measurer_->Measure(measure_inputs);
-    CHECK_EQ(measure_outputs.size(), states.size())
-        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
-           "states in TaskOptimizer";
+    PADDLE_ENFORCE_EQ(measure_outputs.size(),
+                      states.size(),
+                      phi::errors::InvalidArgument(
+                          "ScheduleMeasurer didn't output same number of "
+                          "MeasureOutput of states in TaskOptimizer"));
     // record to database
     for (size_t i = 0; i < states.size(); ++i) {
       database_->AddRecord(TuningRecord(measure_inputs[i].task->serialized_key,
@@ -344,9 +349,11 @@ std::vector<SearchState> TaskOptimizer::SearchOneRound(
   for (size_t i = 0; i < states.size(); ++i) {
     std::vector<ir::Expr> best_exprs =
         states[i]->ir_schedule.GetModule().GetExprs();
-    CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
-        << "RuntimeError: Expr size is not equal to LoweredFunc size in "
-           "TaskOptimizer";
+    PADDLE_ENFORCE_EQ(best_exprs.size(),
+                      task_->lowered_funcs.size(),
+                      phi::errors::InvalidArgument(
+                          "Expr size is not equal to LoweredFunc size in "
+                          "TaskOptimizer"));
     auto init_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
     std::vector<ir::LoweredFunc> valid_funcs;
     for (size_t j = 0; j < best_exprs.size(); ++j) {
@@ -369,8 +376,11 @@ std::vector<SearchState> TaskOptimizer::SearchOneRound(
   }
 
   states.erase(states.begin() + valid_cnt, states.end());
-  CHECK_EQ(states.size(), measure_candidates->size())
-      << "result size of states not equal to measure_candidates";
+  PADDLE_ENFORCE_EQ(
+      states.size(),
+      measure_candidates->size(),
+      phi::errors::InvalidArgument(
+          "result size of states not equal to measure_candidates"));
   VLOG(4) << "EvolutionarySearch return size=" << states.size()
           << ", valid count=" << valid_cnt;
   VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result",
diff --git a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
index a8961e45b980d..f59acbe612635 100644
--- a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
@@ -19,7 +19,7 @@
 #include "paddle/cinn/auto_schedule/task/tune_task.h"
 #include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
 #include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
 
@@ -27,7 +27,10 @@ std::unique_ptr<TaskScheduler> TaskScheduler::Make(
     const std::vector<TuneTask>& tasks,
     const Config& config,
     const std::string& strategy) {
-  CHECK_GT(tasks.size(), 0) << "Empty task list";
+  PADDLE_ENFORCE_GT(
+      tasks.size(),
+      0,
+      phi::errors::InvalidArgument("The tasks should not be empty."));
   if (strategy == "round_robin") {
     return std::make_unique<RoundRobin>(tasks, config);
   } else if (strategy == "efficiency_priority") {
diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
index 2966467b3eda6..c9f2630ac6e8a 100644
--- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -32,8 +32,8 @@
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/data_util.h"
+#include "paddle/common/enforce.h"
 #include "test/cpp/cinn/program_builder.h"
-
 /* This test is used as a tool to evaluate or compare performance of 3
  * schedules(no schedule, manual schedule, auto-schedule). One can specify which
  * schedules to be evaluated through `FLAGS_evaluate_knobs` and specify which
@@ -355,7 +355,10 @@ TEST_F(PerformanceTester, Gather) {
 
 // paddle model test
 TEST_F(PerformanceTester, ResNet50) {
-  CHECK_NE(FLAGS_resnet50_model_dir, "");
+  PADDLE_ENFORCE_NE(FLAGS_resnet50_model_dir,
+                    "",
+                    phi::errors::InvalidArgument(
+                        "FLAGS_resnet50_model_dir should not be empty."));
   FLAGS_cinn_infer_model_version = 1.0;
   std::unordered_map<std::string, std::vector<int64_t>> feeds = {
       {"inputs", {batch_size, 3, 224, 224}}};
diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc
index 85443b02c0a8c..07dc8421de6cc 100644
--- a/paddle/cinn/backends/codegen_c.cc
+++ b/paddle/cinn/backends/codegen_c.cc
@@ -26,7 +26,7 @@
 #include "paddle/cinn/runtime/cpu/thread_backend.h"
 #include "paddle/cinn/runtime/intrinsic.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 //! Root of the builtin code.
 PD_DECLARE_string(cinn_x86_builtin_code_root);
 
@@ -205,7 +205,10 @@ void CodeGenC::Visit(const ir::For *op) {
     Expr num_task_var = Var("num_task");
     IrPrinter::Visit((op->extent + num_task_var - 1) / num_task_var);
     str_ += ";\n";
-    CHECK_EQ(min.as_int32(), 0);
+    PADDLE_ENFORCE_EQ(
+        min.as_int32(),
+        0,
+        phi::errors::InvalidArgument("The min of the for loop should be 0"));
     auto task_id = Var("task_id");
     auto n_per_task = Var("n_per_task");
     min = task_id * n_per_task;
@@ -370,7 +373,10 @@ void CodeGenC::PrintCallArgs(const ir::Call *op) {
 }
 
 void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) {
-  CHECK_EQ(op->read_args.size(), 2UL);
+  PADDLE_ENFORCE_EQ(
+      op->read_args.size(),
+      2UL,
+      phi::errors::InvalidArgument("The number of read_args should be 2"));
   str_ += op->name;
   str_ += "(";
   PrintCastExpr("void*", op->read_args[0]);
@@ -380,7 +386,10 @@ void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) {
 }
 
 void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) {
-  CHECK_EQ(op->read_args.size(), 1UL);
+  PADDLE_ENFORCE_EQ(
+      op->read_args.size(),
+      1UL,
+      phi::errors::InvalidArgument("The number of read_args should be 1"));
   str_ += op->name;
   str_ += "(";
   str_ += "&(";
@@ -390,7 +399,10 @@ void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) {
 }
 
 void CodeGenC::PrintCall_get_address(const ir::Call *op) {
-  CHECK_EQ(op->read_args.size(), 1UL);
+  PADDLE_ENFORCE_EQ(
+      op->read_args.size(),
+      1UL,
+      phi::errors::InvalidArgument("The number of read_args should be 1"));
   CHECK(op->write_args.empty());
   auto *read_var = op->read_args.front().as_var();
   auto *read_buf = op->read_args.front().as_buffer();
@@ -409,7 +421,10 @@ void CodeGenC::PrintCall_get_address(const ir::Call *op) {
 
 void CodeGenC::PrintCall_pod_values_to_array(const ir::Call *op) {
   CHECK(!op->read_args.empty());
-  CHECK_EQ(op->write_args.size(), 1UL);
+  PADDLE_ENFORCE_EQ(
+      op->write_args.size(),
+      1UL,
+      phi::errors::InvalidArgument("The number of write_args should be 1"));
   auto output_var = op->write_args.front().as_var_ref();
   CHECK(output_var.defined());
 
@@ -612,9 +627,12 @@ void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
 
   DoIndent();
 
-  CHECK_EQ(op->alloc_output_buffer_exprs.size(),
-           op->dealloc_output_buffer_exprs.size())
-      << "the count of allocation and deallocation expressions is not match";
+  PADDLE_ENFORCE_EQ(
+      op->alloc_output_buffer_exprs.size(),
+      op->dealloc_output_buffer_exprs.size(),
+      phi::errors::InvalidArgument(
+          "The count of allocation and deallocation expressions does not "
+          "match"));
 
   std::vector<Expr> new_body;
 
diff --git a/paddle/cinn/backends/codegen_c_x86.cc b/paddle/cinn/backends/codegen_c_x86.cc
index 394b61e35816d..06a9ff1fda2f9 100644
--- a/paddle/cinn/backends/codegen_c_x86.cc
+++ b/paddle/cinn/backends/codegen_c_x86.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/cinn/backends/codegen_c_x86.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 
@@ -53,7 +53,11 @@ void CodeGenCX86::Visit(const ir::Load *op) {
 }
 
 void CodeGenCX86::Visit(const ir::Broadcast *op) {
-  CHECK_GT(op->type().lanes(), 1);
+  PADDLE_ENFORCE_GT(
+      op->type().lanes(),
+      1,
+      phi::errors::InvalidArgument(
+          "The lanes of the broadcast op should be greater than 1."));
   int bits = op->type().bits() * op->type().lanes();
 
   if (SupportsAVX512() && bits == 512) {
diff --git a/paddle/cinn/backends/codegen_c_x86.h b/paddle/cinn/backends/codegen_c_x86.h
index f0b040a94f1ae..bf90612292d20 100644
--- a/paddle/cinn/backends/codegen_c_x86.h
+++ b/paddle/cinn/backends/codegen_c_x86.h
@@ -18,7 +18,7 @@
 
 #include "paddle/cinn/backends/codegen_c.h"
 #include "paddle/cinn/ir/intrinsic_ops.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 
@@ -114,8 +114,10 @@ void CodeGenCX86::VisitBinaryOp(const Op *op,
                                 Expr a,
                                 Expr b,
                                 const std::string &op_repr) {
-  CHECK_EQ(a.type(), b.type()) << " a is : " << a << ", and b is : " << b
-                               << ". op_repr is : " << op_repr;
+  PADDLE_ENFORCE_EQ(
+      a.type(),
+      b.type(),
+      phi::errors::InvalidArgument("The type of a and b should be the same."));
 
   // scalar.
   if (a.type().lanes() == 1) {
diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc
index 9c19c6faffb73..919edfc680ca7 100644
--- a/paddle/cinn/backends/codegen_cuda_dev.cc
+++ b/paddle/cinn/backends/codegen_cuda_dev.cc
@@ -26,8 +26,8 @@
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/ir/utils/ir_verify.h"
 #include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/common/enforce.h"
 #include "paddle/common/errors.h"
-
 namespace cinn {
 namespace backends {
 
@@ -122,7 +122,8 @@ std::vector<Expr> FilterDeallocTempBuffers(const std::vector<Expr> &frees) {
   std::vector<Expr> filtered;
   for (const Expr &free : frees) {
     const ir::Free *op = free.As<ir::Free>();
-    CHECK_NOTNULL(op);
+    PADDLE_ENFORCE_NOT_NULL(
+        op, phi::errors::InvalidArgument("Free is not a free node"));
     bool has_symbolic_constant = false;
     const ir::_Buffer_ *buffer = op->destination.As<ir::_Buffer_>();
     for (Expr shape : buffer->shape) {
@@ -305,7 +306,10 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module,
 void CodeGenCUDA_Dev::PrintIncludes() { str_ += GetSourceHeader(); }
 
 void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) {
-  CHECK_NE(buffer->type(), Void());
+  PADDLE_ENFORCE_NE(
+      buffer->type(),
+      Void(),
+      phi::errors::InvalidArgument("buffer type should not be void"));
   // Calculate buffer size and determine if it contains a symbolic constant
   Expr buffer_size(1);
   for (int i = 0; i < buffer->shape.size(); i++) {
diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc
index b888db7c7c726..1ba4714153395 100644
--- a/paddle/cinn/backends/codegen_cuda_host.cc
+++ b/paddle/cinn/backends/codegen_cuda_host.cc
@@ -23,7 +23,7 @@
 #include "paddle/cinn/backends/extern_func_jit_register.h"
 #include "paddle/cinn/backends/llvm/llvm_util.h"
 #include "paddle/cinn/runtime/intrinsic.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 
@@ -65,10 +65,22 @@ llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher(
   llvm::Value* kernel_stream = nullptr;
   if (ll_function_args.size() == 3) {
     kernel_stream = ll_function_args[2];
-    CHECK_EQ(kernel_stream->getType(), ll_void_p_ty());  // void* stream
+    PADDLE_ENFORCE_EQ(
+        kernel_stream->getType(),
+        ll_void_p_ty(),
+        phi::errors::InvalidArgument(
+            "The type of kernel_stream should be void*"));  // void* stream
   }
-  CHECK_EQ(kernel_args->getType(), ll_void_p_ty());       // void* args
-  CHECK_EQ(kernel_args_count->getType(), ll_int32_ty());  // int32
+  PADDLE_ENFORCE_EQ(
+      kernel_args->getType(),
+      ll_void_p_ty(),
+      phi::errors::InvalidArgument(
+          "The type of kernel_args should be void*"));  // void* args
+  PADDLE_ENFORCE_EQ(
+      kernel_args_count->getType(),
+      ll_int32_ty(),
+      phi::errors::InvalidArgument(
+          "The type of kernel_args_count should be int32"));  // int32
 
   std::unordered_map<std::string, llvm::Value*> global_args = {
       {KERNEL_ARGS, kernel_args},
@@ -199,7 +211,11 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) {
   // @}
 
   // Set local scope table
-  CHECK_EQ(ll_function_args.size(), func->args.size());
+  PADDLE_ENFORCE_EQ(ll_function_args.size(),
+                    func->args.size(),
+                    phi::errors::InvalidArgument(
+                        "The number of arguments is not equal to the number of "
+                        "function arguments"));
   for (int i = 0; i < ll_function_args.size(); ++i) {
     SetVar(func->args[i].name(), ll_function_args[i]);
   }
@@ -224,7 +240,11 @@ llvm::Value* CodeGenCUDA_Host::LowerParseArgsValueCall(
     const ir::Call* call_ir) {
   auto ret_type = CinnTypeToLLVMType(Int(64), m_);
   std::vector<llvm::Type*> args_type;
-  CHECK_EQ(call_ir->read_args.size(), 2);
+  PADDLE_ENFORCE_EQ(
+      call_ir->read_args.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The number of arguments of ParseArgsValue should be 2"));
   CHECK(call_ir->read_args[0].is_var() &&
         call_ir->read_args[0].as_var()->type().is_cpp_handle());
   CHECK(call_ir->read_args[1].type().is_int(32));
@@ -251,10 +271,22 @@ llvm::Value* CodeGenCUDA_Host::LowerCUDAKernelCall(const ir::Call* call_ir) {
   llvm::Value* kernel_stream = nullptr;
   if (ll_function_args.size() == 3) {
     kernel_stream = ll_function_args[2];
-    CHECK_EQ(kernel_stream->getType(), ll_void_p_ty());  // void* stream
+    PADDLE_ENFORCE_EQ(
+        kernel_stream->getType(),
+        ll_void_p_ty(),
+        phi::errors::InvalidArgument(
+            "The type of kernel_stream should be void*"));  // void* stream
   }
-  CHECK_EQ(kernel_args->getType(), ll_void_p_ty());       // void* args
-  CHECK_EQ(kernel_args_count->getType(), ll_int32_ty());  // int32
+  PADDLE_ENFORCE_EQ(
+      kernel_args->getType(),
+      ll_void_p_ty(),
+      phi::errors::InvalidArgument(
+          "The type of kernel_args should be void*"));  // void* args
+  PADDLE_ENFORCE_EQ(
+      kernel_args_count->getType(),
+      ll_int32_ty(),
+      phi::errors::InvalidArgument(
+          "The type of kernel_args_count should be int32"));  // int32
 
   std::unordered_map<std::string, llvm::Value*> global_args = {
       {KERNEL_ARGS, kernel_args},
diff --git a/paddle/cinn/backends/codegen_device_util.cc b/paddle/cinn/backends/codegen_device_util.cc
index 3373ed15e3bec..91c18ea35e9ea 100644
--- a/paddle/cinn/backends/codegen_device_util.cc
+++ b/paddle/cinn/backends/codegen_device_util.cc
@@ -68,6 +68,18 @@ std::string Predicate2String(ir::Expr predicate) {
   return ss.str();
 }
 
+// Returns origin_fn_name unchanged if it fits in kMaxStrLength characters;
+// otherwise keeps that prefix and appends std::hash of the full name.
+static std::string CurTailFnName(const std::string &origin_fn_name) {
+  constexpr std::string::size_type kMaxStrLength = 16383;
+  if (origin_fn_name.length() <= kMaxStrLength) {
+    return origin_fn_name;
+  }
+  VLOG(6) << "Function name too long. Curtail and concat hash.";
+  return origin_fn_name.substr(0, kMaxStrLength) +
+         std::to_string(std::hash<std::string>()(origin_fn_name));
+}
+
 std::string
 detail::CollectBucketStrategyHostFunctionVisitor::GenDeviceKernelName(
     const std::string &fn_name, ir::Expr predicate) {
@@ -80,7 +92,10 @@ detail::CollectBucketStrategyHostFunctionVisitor::GenDeviceKernelName(
     pos = cond_str.find("-", pos + replacement.length());
   }
   VLOG(3) << "predicate string: " << cond_str;
-  return fn_name + "__COND_" + cond_str + "__kernel";
+  // NOTE(chenxi67): The kernel name is too long to be supported in cuda12.3 so
+  // we need to curtail it.
+  const std::string new_fn_name = CurTailFnName(fn_name);
+  return new_fn_name + "__COND_" + cond_str + "__kernel";
 }
 
 void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
diff --git a/paddle/cinn/backends/codegen_device_util.h b/paddle/cinn/backends/codegen_device_util.h
index caada3153e63b..ff3114c71296b 100644
--- a/paddle/cinn/backends/codegen_device_util.h
+++ b/paddle/cinn/backends/codegen_device_util.h
@@ -27,7 +27,7 @@
 #include "paddle/cinn/ir/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 
@@ -205,7 +205,11 @@ struct CollectBucketStrategyHostFunctionVisitor
     if (op->functions.size() == 1 && op->predicates.size() == 0) {
       expr->as_module()->predicates.push_back(ir::Expr(true));
     }
-    CHECK_EQ(op->functions.size(), op->predicates.size());
+    PADDLE_ENFORCE_EQ(
+        op->functions.size(),
+        op->predicates.size(),
+        phi::errors::InvalidArgument(
+            "The size of functions and predicates should be equal"));
     for (int i = 0; i < op->functions.size(); ++i) {
       ProcessLoweredFunc(op->functions[i], op->predicates[i]);
       if (i == 0) {
diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc
index 4f02a35411413..72678eec44c22 100644
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -230,15 +230,23 @@ void SourceCodePrint::write(const std::string& source_code) {
   }
 }
 
-void Compiler::Build(const Module& module, const std::string& code) {
-  auto PatternMatch =
-      adt::match{[&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
-                 [&](common::X86Arch) { CompileX86Module(module); },
-                 [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
-                 [&](common::NVGPUArch) { CompileCudaModule(module, code); }};
+void Compiler::Build(const Module& module,
+                     const std::string& code,
+                     const bool end) {
+  auto PatternMatch = adt::match{
+      [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) { CompileX86Module(module, end); },
+      [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) { CompileCudaModule(module, code, end); }};
   return std::visit(PatternMatch, target_.arch.variant());
 }
 
+void Compiler::AppendCX86(const Module& module) {
+  VLOG(3) << "Start Compiler::AppendCX86" << module;
+  CompileX86Module(module, /*add_module=*/true);
+  VLOG(3) << "Over Compiler::AppendCX86";
+}
+
 std::string Compiler::GetSourceCode(const ir::Module& module) {
   return target_.arch.Visit(adt::match{
       [&](common::UnknownArch) -> std::string { CINN_NOT_IMPLEMENTED; },
@@ -287,7 +295,8 @@ std::string GetFileContent(const std::string& path) {
 }  // namespace
 
 void Compiler::CompileCudaModule(const Module& module,
-                                 const std::string& code) {
+                                 const std::string& code,
+                                 bool add_module) {
 #ifdef CINN_WITH_CUDA
   auto _host_module_device_module_ =
       SplitDeviceAndHostModule(module);  // NOLINT
@@ -337,15 +346,15 @@ void Compiler::CompileCudaModule(const Module& module,
   }
 
   engine_ = ExecutionEngine::Create(ExecutionOptions(), std::move(symbols));
-  engine_->Link<CodeGenCUDA_Host>(host_module);
+  engine_->Link<CodeGenCUDA_Host>(host_module, add_module);
 
 #else
   CINN_NOT_IMPLEMENTED
 #endif
 }
 
-void Compiler::CompileX86Module(const Module& module) {
-  engine_->Link<CodeGenX86>(module);
+void Compiler::CompileX86Module(const Module& module, bool add_module) {
+  engine_->Link<CodeGenX86>(module, add_module);
 }
 
 void Compiler::ExportObject(const std::string& path) {
diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h
index f269b00492a42..d43455cf76287 100644
--- a/paddle/cinn/backends/compiler.h
+++ b/paddle/cinn/backends/compiler.h
@@ -107,7 +107,10 @@ class Compiler final {
   /**
    * Compile and link to a CINN module.
    */
-  void Build(const ir::Module& module, const std::string& code = "");
+  void Build(const ir::Module& module,
+             const std::string& code = "",
+             const bool end = true);
+  void AppendCX86(const ir::Module& module);
 
   void ExportObject(const std::string& path);
 
@@ -125,9 +128,10 @@ class Compiler final {
 
  private:
   void CompileCudaModule(const ir::Module& module,
-                         const std::string& code = "");
+                         const std::string& code = "",
+                         bool add_module = true);
 
-  void CompileX86Module(const ir::Module& module);
+  void CompileX86Module(const ir::Module& module, bool add_module = true);
 
   explicit Compiler(const Target& target)
       : target_(target), engine_(ExecutionEngine::Create(ExecutionOptions())) {}
diff --git a/paddle/cinn/backends/function_prototype.cc b/paddle/cinn/backends/function_prototype.cc
index e413521246b8f..e46b172bf65ed 100644
--- a/paddle/cinn/backends/function_prototype.cc
+++ b/paddle/cinn/backends/function_prototype.cc
@@ -20,7 +20,7 @@
 
 #include "paddle/cinn/ir/tensor.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(verbose_function_register);
 
 namespace cinn {
@@ -42,13 +42,22 @@ bool FunctionProto::Match(const ir::Call *op) const {
 }
 
 void FunctionProto::AssertMatch(const ir::Call *op) const {
-  CHECK_EQ(name, op->name);
-  CHECK_EQ(ret_type, op->type())
-      << "function proto " << name << " check failed";
-  CHECK_EQ(op->read_args.size(), readonly_arg_types.size())
-      << "function proto " << name << " check failed";
-  CHECK_EQ(op->write_args.size(), mutable_arg_types.size())
-      << "function proto " << name << " check failed";
+  PADDLE_ENFORCE_EQ(
+      name,
+      op->name,
+      phi::errors::InvalidArgument("function proto's op name check failed"));
+  PADDLE_ENFORCE_EQ(
+      ret_type,
+      op->type(),
+      phi::errors::InvalidArgument("function proto's op type check failed"));
+  PADDLE_ENFORCE_EQ(op->read_args.size(),
+                    readonly_arg_types.size(),
+                    phi::errors::InvalidArgument(
+                        "function proto's readonly arg types check failed"));
+  PADDLE_ENFORCE_EQ(op->write_args.size(),
+                    mutable_arg_types.size(),
+                    phi::errors::InvalidArgument(
+                        "function proto's mutable arg types check failed"));
 
   auto get_type = [](Expr u) {
     if (u.as_tensor() || u.as_buffer()) {
@@ -61,14 +70,21 @@ void FunctionProto::AssertMatch(const ir::Call *op) const {
     if (readonly_arg_types[i] == type_of<cinn_buffer_t *>()) {
       if (!op->read_args[i].as_tensor()) continue;
     } else {
-      CHECK_EQ(get_type(op->read_args[i]), readonly_arg_types[i]);
+      PADDLE_ENFORCE_EQ(
+          get_type(op->read_args[i]),
+          readonly_arg_types[i],
+          phi::errors::InvalidArgument(
+              "function proto's readonly arg types check failed"));
     }
   }
   for (int i = 0; i < op->write_args.size(); i++) {
     if (mutable_arg_types[i] == type_of<cinn_buffer_t *>()) {
       if (!op->write_args[i].as_tensor()) continue;
     } else {
-      CHECK_EQ(get_type(op->write_args[i]), mutable_arg_types[i]);
+      PADDLE_ENFORCE_EQ(get_type(op->write_args[i]),
+                        mutable_arg_types[i],
+                        phi::errors::InvalidArgument(
+                            "function proto's mutable arg types check failed"));
     }
   }
 }
@@ -86,7 +102,10 @@ void FunctionProto::CheckValid() {
 
 FunctionProto::shape_inference_t FunctionProto::ShapeFollowNthArgument(int n) {
   return [=](const std::vector<Expr> &args, int value_offset) {
-    CHECK_LT(n, args.size());
+    PADDLE_ENFORCE_LT(
+        n,
+        args.size(),
+        phi::errors::InvalidArgument("The argument index is out of range"));
     auto x = args[n].as_tensor();
     CHECK(x);
     return x->shape;
diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc
index 29eae201bbb78..a516b20c75804 100644
--- a/paddle/cinn/backends/ir_schedule_test.cc
+++ b/paddle/cinn/backends/ir_schedule_test.cc
@@ -31,7 +31,7 @@
 #include "paddle/cinn/optim/remove_schedule_block.h"
 #include "paddle/cinn/optim/unroll_loops.h"
 #include "paddle/cinn/optim/vectorize_loops.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 
@@ -563,7 +563,10 @@ TEST(IrSchedule, vectorize) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      2U,
+      phi::errors::InvalidArgument("The size of loops should be 2."));
   ir_sch.Vectorize(loops[1], 16);
   std::string origin = utils::GetStreamCnt(func[0]);
   EXPECT_EQ(origin, utils::Trim(R"ROC(
@@ -637,7 +640,10 @@ TEST(IrSchedule, unroll) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      2U,
+      phi::errors::InvalidArgument("The size of loops should be 2."));
   ir_sch.Unroll(loops[1]);
   std::string origin = utils::GetStreamCnt(func[0]);
   EXPECT_EQ(origin, utils::Trim(R"ROC(
@@ -711,7 +717,10 @@ TEST(IrSchedule, bind) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      2U,
+      phi::errors::InvalidArgument("The size of loops should be 2."));
   ir_sch.Bind(loops[0], "blockIdx.x");
   std::string origin = utils::GetStreamCnt(func[0]);
   EXPECT_EQ(origin, utils::Trim(R"ROC(
@@ -753,7 +762,10 @@ TEST(IrSchedule, simple_compute_at) {
 
   auto func = cinn::lang::LowerVec(
       "test_simple_compute_at", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -826,7 +838,10 @@ TEST(IrSchedule, compute_at0) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at0", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -900,7 +915,10 @@ TEST(IrSchedule, compute_at1) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at1", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -972,7 +990,10 @@ TEST(IrSchedule, compute_at2) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at2", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1044,7 +1065,10 @@ TEST(IrSchedule, compute_at3) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at3", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1125,7 +1149,10 @@ TEST(IrSchedule, compute_at4) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at4", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1187,7 +1214,10 @@ TEST(IrSchedule, compute_at5) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at5", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1250,7 +1280,10 @@ TEST(IrSchedule, compute_at6) {
 
   auto func = cinn::lang::LowerVec(
       "test_compute_at6", stages, {A, C}, {}, {}, nullptr, target, true);
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1316,7 +1349,10 @@ TEST(IrSchedule, cache_read1) {
   auto func = cinn::lang::LowerVec(
       "test_cache_read1", stages, {A, C}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1362,7 +1398,7 @@ void test_cache_read1(void* _args, int32_t num_args)
   };
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      B[((32 * i) + j)] = (2.00000000f * A_local_temp_buffer[((64 * i) + j)]);
+      B[((32 * i) + j)] = (A_local_temp_buffer[((64 * i) + j)] * 2.00000000f);
     };
   };
   for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 16; cache_ax0_0 += 1) {
@@ -1372,7 +1408,7 @@ void test_cache_read1(void* _args, int32_t num_args)
   };
   for (int32_t i = 0; i < 16; i += 1) {
     for (int32_t j = 0; j < 16; j += 1) {
-      C[((16 * i) + j)] = (1.00000000f + B_local_temp_buffer[((32 * i) + j)]);
+      C[((16 * i) + j)] = (B_local_temp_buffer[((32 * i) + j)] + 1.00000000f);
     };
   };
   cinn_buffer_free((void*)(0), _B);
@@ -1399,7 +1435,10 @@ TEST(IrSchedule, cache_read2) {
   auto func = cinn::lang::LowerVec(
       "test_cache_read2", stages, {A, B}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1441,7 +1480,7 @@ void test_cache_read2(void* _args, int32_t num_args)
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
       A_local_temp_buffer[((32 * i) + j)] = A[((32 * i) + j)];
-      B[((32 * i) + j)] = (2.00000000f * A_local_temp_buffer[((32 * i) + j)]);
+      B[((32 * i) + j)] = (A_local_temp_buffer[((32 * i) + j)] * 2.00000000f);
     };
   };
   cinn_buffer_free((void*)(0), _B);
@@ -1469,7 +1508,10 @@ TEST(IrSchedule, cache_write1) {
   auto func = cinn::lang::LowerVec(
       "test_cache_write1", stages, {A, C}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1511,7 +1553,7 @@ void test_cache_write1(void* _args, int32_t num_args)
   float* C = ((float*)(_C->memory));
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      B_local_temp_buffer[((32 * i) + j)] = (2.00000000f * A[((32 * i) + j)]);
+      B_local_temp_buffer[((32 * i) + j)] = (A[((32 * i) + j)] * 2.00000000f);
     };
   };
   for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
@@ -1521,7 +1563,7 @@ void test_cache_write1(void* _args, int32_t num_args)
   };
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      C_local_temp_buffer[((32 * i) + j)] = (1.00000000f + B[((32 * i) + j)]);
+      C_local_temp_buffer[((32 * i) + j)] = (B[((32 * i) + j)] + 1.00000000f);
     };
   };
   for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 64; cache_ax0_0 += 1) {
@@ -1553,7 +1595,10 @@ TEST(IrSchedule, cache_write2) {
   auto func = cinn::lang::LowerVec(
       "test_cache_write2", stages, {A, B}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1592,7 +1637,7 @@ void test_cache_write2(void* _args, int32_t num_args)
   float* B = ((float*)(_B->memory));
   for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
     for (int32_t cache_ax1 = 0; cache_ax1 < 32; cache_ax1 += 1) {
-      B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)] = (2.00000000f * A[((32 * cache_ax0) + cache_ax1)]);
+      B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)] = (A[((32 * cache_ax0) + cache_ax1)] * 2.00000000f);
       B[((32 * cache_ax0) + cache_ax1)] = B_local_temp_buffer[((32 * cache_ax0) + cache_ax1)];
     };
   };
@@ -1624,7 +1669,10 @@ TEST(IrSchedule, cache_read3) {
   auto func = cinn::lang::LowerVec(
       "test_cache_read3", stages, {A, C}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1665,7 +1713,7 @@ void test_cache_read3(const float* __restrict__ A, float* __restrict__ C)
   };
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      B[((32 * i) + j)] = (2.00000000f * A_local_temp_buffer[((64 * i) + j)]);
+      B[((32 * i) + j)] = (A_local_temp_buffer[((64 * i) + j)] * 2.00000000f);
     };
     __syncthreads();
   };
@@ -1677,7 +1725,7 @@ void test_cache_read3(const float* __restrict__ A, float* __restrict__ C)
   for (int32_t i = 0; i < 16; i += 1) {
     __syncthreads();
     for (int32_t j = 0; j < 16; j += 1) {
-      C[((16 * i) + j)] = (1.00000000f + B_local_temp_buffer[((32 * i) + j)]);
+      C[((16 * i) + j)] = (B_local_temp_buffer[((32 * i) + j)] + 1.00000000f);
     };
   };
 }
@@ -1705,7 +1753,10 @@ TEST(IrSchedule, cache_write3) {
   auto func = cinn::lang::LowerVec(
       "test_cache_write3", stages, {A, C}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1743,7 +1794,7 @@ void test_cache_write3(const float* __restrict__ A, float* __restrict__ C)
   float* B = _B_temp_buffer;
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      B_local_temp_buffer[((32 * i) + j)] = (2.00000000f * A[((32 * i) + j)]);
+      B_local_temp_buffer[((32 * i) + j)] = (A[((32 * i) + j)] * 2.00000000f);
     };
   };
   for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
@@ -1754,7 +1805,7 @@ void test_cache_write3(const float* __restrict__ A, float* __restrict__ C)
   __syncthreads();
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      C_local_temp_buffer[((32 * i) + j)] = (1.00000000f + B[((32 * i) + j)]);
+      C_local_temp_buffer[((32 * i) + j)] = (B[((32 * i) + j)] + 1.00000000f);
     };
   };
   __syncthreads();
@@ -1788,7 +1839,10 @@ TEST(IrSchedule, sync_threads) {
   auto func = cinn::lang::LowerVec(
       "test_sync_threads", stages, {A, C}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1824,7 +1878,7 @@ void test_sync_threads(const float* __restrict__ A, float* __restrict__ C)
   float* B = _B_temp_buffer;
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      B_local_temp_buffer[((32 * i) + j)] = (2.00000000f * A[((32 * i) + j)]);
+      B_local_temp_buffer[((32 * i) + j)] = (A[((32 * i) + j)] * 2.00000000f);
     };
   };
   for (int32_t cache_ax0 = 0; cache_ax0 < 64; cache_ax0 += 1) {
@@ -1835,7 +1889,7 @@ void test_sync_threads(const float* __restrict__ A, float* __restrict__ C)
   };
   for (int32_t i = 0; i < 64; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
-      C_local_temp_buffer[((32 * i) + j)] = (1.00000000f + B[((32 * i) + j)]);
+      C_local_temp_buffer[((32 * i) + j)] = (B[((32 * i) + j)] + 1.00000000f);
     };
   };
   for (int32_t cache_ax0_0 = 0; cache_ax0_0 < 64; cache_ax0_0 += 1) {
@@ -1870,7 +1924,10 @@ TEST(IrSchedule, cache_write4) {
   auto func = cinn::lang::LowerVec(
       "test_cache_write4", stages, {A, B}, {}, {}, nullptr, target, true);
 
-  CHECK_EQ(func.size(), 1U);
+  PADDLE_ENFORCE_EQ(
+      func.size(),
+      1U,
+      phi::errors::InvalidArgument("The size of func should be 1."));
 
   auto ast_expr = func[0]->body;
   std::vector<Expr> vec_ast{ast_expr};
@@ -1953,7 +2010,10 @@ TEST(IrSchedule, rfactor) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 3U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      3U,
+      phi::errors::InvalidArgument("The size of loops should be 3."));
   auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0);
   auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
   CHECK(new_rf_tensor_ref);
@@ -2080,7 +2140,10 @@ TEST(IrSchedule, rfactor1) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 3U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      3U,
+      phi::errors::InvalidArgument("The size of loops should be 3."));
   auto new_rf_tensor = ir_sch.Rfactor(loops[1], 1);
   auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
   CHECK(new_rf_tensor_ref);
@@ -2206,7 +2269,10 @@ TEST(IrSchedule, rfactor2) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("C");
-  CHECK_EQ(loops.size(), 3U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      3U,
+      phi::errors::InvalidArgument("The size of loops should be 3."));
   auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0);
   auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
   CHECK(new_rf_tensor_ref);
@@ -2347,7 +2413,10 @@ TEST(IrSchedule, factorize_reduction) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 3U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      3U,
+      phi::errors::InvalidArgument("The size of loops should be 3."));
   auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 0);
   auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
   CHECK(new_rf_tensor_ref);
@@ -2436,7 +2505,10 @@ TEST(IrSchedule, factorize_reduction1) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 3U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      3U,
+      phi::errors::InvalidArgument("The size of loops should be 3."));
   auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 1);
   auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
   CHECK(new_rf_tensor_ref);
@@ -2520,9 +2592,15 @@ TEST(IrSchedule, factorize_reduction2) {
   ir::ModuleExpr mod_expr(vec_ast);
   ir::IRSchedule ir_sch(mod_expr);
   auto loops = ir_sch.GetLoops("B");
-  CHECK_EQ(loops.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      loops.size(),
+      2U,
+      phi::errors::InvalidArgument("The size of loops should be 2."));
   auto splited_loops = ir_sch.Split(loops[1], {4, 5});
-  CHECK_EQ(splited_loops.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      splited_loops.size(),
+      2U,
+      phi::errors::InvalidArgument("The size of splited_loops should be 2."));
   auto new_rf_tensor = ir_sch.FactorizeReduction(splited_loops[0], 1);
   auto* new_rf_tensor_ref = new_rf_tensor.As<ir::_Tensor_>();
   CHECK(new_rf_tensor_ref);
@@ -2638,7 +2716,7 @@ void test_compute_inline1(void* _args, int32_t num_args)
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
       for (int32_t k = 0; k < 32; k += 1) {
-        C[((1024 * i) + ((32 * j) + k))] = fma(2.00000000f, A[((32 * i) + ((1024 * j) + k))], 2.00000000f);
+        C[((1024 * i) + ((32 * j) + k))] = ((A[((32 * i) + ((1024 * j) + k))] + 1.00000000f) * 2.00000000f);
       };
     };
   };
@@ -2712,7 +2790,7 @@ void test_compute_inline2(void* _args, int32_t num_args)
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
       for (int32_t k = 0; k < 32; k += 1) {
-        C[((1024 * i) + ((32 * j) + k))] = fma(2.00000000f, A[((1024 * i) + ((32 * j) + k))], 2.00000000f);
+        C[((1024 * i) + ((32 * j) + k))] = ((A[((1024 * i) + ((32 * j) + k))] + 1.00000000f) * 2.00000000f);
       };
     };
   };
@@ -2777,7 +2855,7 @@ void test_compute_inline3(const float* __restrict__ A, float* __restrict__ C)
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
       for (int32_t k = 0; k < 32; k += 1) {
-        C[((1024 * i) + ((32 * j) + k))] = (2.00000000f + (2.00000000f * A[((32 * i) + ((1024 * j) + k))]));
+        C[((1024 * i) + ((32 * j) + k))] = ((A[((32 * i) + ((1024 * j) + k))] + 1.00000000f) * 2.00000000f);
       };
     };
   };
@@ -2839,7 +2917,7 @@ void test_compute_inline4(const float* __restrict__ A, float* __restrict__ C)
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
       for (int32_t k = 0; k < 32; k += 1) {
-        C[((1024 * i) + ((32 * j) + k))] = (2.00000000f + (2.00000000f * A[((1024 * i) + ((32 * j) + k))]));
+        C[((1024 * i) + ((32 * j) + k))] = ((A[((1024 * i) + ((32 * j) + k))] + 1.00000000f) * 2.00000000f);
       };
     };
   };
@@ -2901,7 +2979,7 @@ void test_compute_inline1(void* _args, int32_t num_args)
   float* C = ((float*)(_C->memory));
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 64; j += 1) {
-      C[((32 * j) + i)] = fma(2.00000000f, A[((64 * i) + j)], 2.00000000f);
+      C[((32 * j) + i)] = (2.00000000f * (1.00000000f + A[((64 * i) + j)]));
     };
   };
   cinn_buffer_free((void*)(0), _B);
@@ -2969,7 +3047,7 @@ void test_compute_inline1(void* _args, int32_t num_args)
   for (int32_t i = 0; i < 32; i += 1) {
     for (int32_t j = 0; j < 32; j += 1) {
       for (int32_t k = 0; k < 32; k += 1) {
-        C[((32 * i) + ((1024 * j) + k))] = fma(2.00000000f, A[((1024 * i) + ((32 * j) + k))], 2.00000000f);
+        C[((32 * i) + ((1024 * j) + k))] = (2.00000000f * (1.00000000f + A[((1024 * i) + ((32 * j) + k))]));
       };
     };
   };
@@ -3047,7 +3125,7 @@ void test_copytransform1(void* _args, int32_t num_args)
       for (int32_t j = 0; j < 8; j += 1) {
         for (int32_t j_0 = 0; j_0 < 4; j_0 += 1) {
           for (int32_t k = 0; k < 32; k += 1) {
-            B[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] = (1.00000000f + A[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))]);
+            B[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] = (A[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] + 1.00000000f);
           };
         };
       };
@@ -3058,7 +3136,7 @@ void test_copytransform1(void* _args, int32_t num_args)
       for (int32_t j = 0; j < 8; j += 1) {
         for (int32_t j_0 = 0; j_0 < 4; j_0 += 1) {
           for (int32_t k = 0; k < 32; k += 1) {
-            C[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] = (2.00000000f * B[((256 * i) + ((32 * i_0) + ((4096 * j) + ((1024 * j_0) + k))))]);
+            C[((8192 * i) + ((1024 * i_0) + ((128 * j) + ((32 * j_0) + k))))] = (B[((256 * i) + ((32 * i_0) + ((4096 * j) + ((1024 * j_0) + k))))] * 2.00000000f);
           };
         };
       };
@@ -3136,7 +3214,7 @@ void test_copytransform2(void* _args, int32_t num_args)
     for (int32_t i_0 = 0; i_0 < 8; i_0 += 1) {
       for (int32_t j = 0; j < 64; j += 1) {
         for (int32_t k = 0; k < 128; k += 1) {
-          B[((65536 * i) + ((8192 * i_0) + ((128 * j) + k)))] = (1.00000000f + A[((65536 * i) + ((8192 * i_0) + ((128 * j) + k)))]);
+          B[((65536 * i) + ((8192 * i_0) + ((128 * j) + k)))] = (A[((65536 * i) + ((8192 * i_0) + ((128 * j) + k)))] + 1.00000000f);
         };
       };
     };
@@ -3146,7 +3224,7 @@ void test_copytransform2(void* _args, int32_t num_args)
       for (int32_t j = 0; j < 8; j += 1) {
         for (int32_t j_0 = 0; j_0 < 4; j_0 += 1) {
           for (int32_t k = 0; k < 128; k += 1) {
-            C[((32768 * i) + ((4096 * i_0) + ((512 * j) + ((128 * j_0) + k))))] = (2.00000000f * B[((65536 * i) + ((8192 * i_0) + ((512 * j) + ((128 * j_0) + k))))]);
+            C[((32768 * i) + ((4096 * i_0) + ((512 * j) + ((128 * j_0) + k))))] = (B[((65536 * i) + ((8192 * i_0) + ((512 * j) + ((128 * j_0) + k))))] * 2.00000000f);
           };
         };
       };
@@ -3278,13 +3356,19 @@ TEST(IrSchedule, ComplexIndices) {
   VLOG(3) << "Lowered Expr:" << ir_sch.GetModule().GetExprs().front();
 
   auto loops_b = ir_sch.GetLoops("B");
-  CHECK_EQ(loops_b.size(), 2);
+  PADDLE_ENFORCE_EQ(
+      loops_b.size(),
+      2,
+      phi::errors::InvalidArgument("The loops size of B should be 2."));
   ir_sch.Split("B", 0, {8, -1});
   ir_sch.Split(
       "B", 2, {32, -1});  // after first splited, loops size has added to 3
   VLOG(3) << "Splited Expr:" << ir_sch.GetModule().GetExprs().front();
 
-  CHECK_EQ(ir_sch.GetLoops("B").size(), 4);
+  PADDLE_ENFORCE_EQ(ir_sch.GetLoops("B").size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The loops size of B should be 4 after split."));
   ir_sch.Reorder("B", {2, 0, 3, 1});
   VLOG(3) << "Reordered Expr:\n" << ir_sch.GetModule().GetExprs().front();
 
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc
index 2f8a387045bf6..d7889ebb9fc15 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm.cc
@@ -24,7 +24,6 @@
 #include <llvm/IR/Metadata.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/raw_ostream.h>
-
 #include <algorithm>
 #include <functional>
 #include <iostream>
@@ -32,6 +31,7 @@
 #include <sstream>
 #include <string>
 #include <type_traits>
+#include "paddle/common/enforce.h"
 
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -205,12 +205,12 @@ llvm::Value *CodeGenLLVM::EmitBinaryOp(llvm::Value *lhs,
                                        bool is_integral,
                                        bool is_signed) {
   llvm::Instruction::BinaryOps ops;
-  CHECK_EQ(lhs->getType(), rhs->getType())
-      << "the types of operands of binary operation are mismatch"
-      << ", lhs[" << DumpToString(*lhs) << "] " << opcode << " rhs["
-      << DumpToString(*rhs) << "]"
-      << ", lhs_type[" << DumpToString(*lhs->getType()) << "], rhs_type["
-      << DumpToString(*rhs->getType()) << "]";
+  PADDLE_ENFORCE_EQ(
+      lhs->getType(),
+      rhs->getType(),
+      phi::errors::InvalidArgument(
+          "the types of operands of binary operation are mismatch"));
+
   switch (opcode) {
     case '+':
       ops = is_integral ? llvm::Instruction::BinaryOps::Add
@@ -288,6 +288,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Sub *op) {
 }
 
 llvm::Value *CodeGenLLVM::Visit(const ir::Mul *op) {
+  ir::TryElevateInt32ToInt64({op->a(), op->b()});
   auto *lhs = Visit(&op->a());
   auto *rhs = Visit(&op->b());
   return EmitBinaryOp(lhs, rhs, '*', is_integral_type(op->type()));
@@ -591,8 +592,8 @@ llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) {
 
   llvm::Value *old_var = GetVar(op->loop_var->name);
   // loop iterator
-  llvm::AllocaInst *loop_var =
-      Alloca(b_->getInt32Ty(), nullptr, op->loop_var->name);
+  llvm::AllocaInst *loop_var = Alloca(
+      b_->getIntNTy(op->min->type().bits()), nullptr, op->loop_var->name);
   loop_var->setAlignment(llvm::Align(4));
   SetVar(op->loop_var->name, loop_var);
 
@@ -613,7 +614,8 @@ llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) {
 
   // loop_body
   b_->SetInsertPoint(body_bb);
-  llvm::Value *step = llvm::ConstantInt::get(b_->getInt32Ty(), stride);
+  llvm::Value *step =
+      llvm::ConstantInt::get(b_->getIntNTy(op->min->type().bits()), stride);
 
   Visit(&op->body);
   llvm::Value *indvar_inc = Add(indvar,
@@ -880,7 +882,10 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Load *op) {
     {
       int alignment = op->type().bits();
       alignment = 8;
-      CHECK_GT(alignment, 0);
+      PADDLE_ENFORCE_GT(
+          alignment,
+          0,
+          phi::errors::InvalidArgument("alignment should be greater than 0"));
       load_inst->setAlignment(llvm::Align(std::min(alignment, 8)));
     }
 
@@ -949,7 +954,10 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) {
     {
       int alignment = op->type().bits();
       alignment = 8;
-      CHECK_GT(alignment, 0);
+      PADDLE_ENFORCE_GT(
+          alignment,
+          0,
+          phi::errors::InvalidArgument("alignment should be greater than 0"));
       store_inst->setAlignment(llvm::Align(std::min(alignment, 8)));
     }
     // TODO(fc500110): tbaa AliasAnalysis
@@ -1059,9 +1067,12 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_LoweredFunc_ *op) {
   auto init_function_state = [this]() { alias_vars_.clear(); };
   init_function_state();
 
-  CHECK_EQ(op->alloc_output_buffer_exprs.size(),
-           op->dealloc_output_buffer_exprs.size())
-      << "the count of allocation and deallocation expressions is not match";
+  PADDLE_ENFORCE_EQ(
+      op->alloc_output_buffer_exprs.size(),
+      op->dealloc_output_buffer_exprs.size(),
+      phi::errors::InvalidArgument(
+          "the count of allocation and deallocation expressions is not "
+          "match"));
 
   std::vector<Expr> new_body;
   auto create_temp_buffers = op->PrepareCreateTempBufferExprs();
@@ -1228,7 +1239,11 @@ llvm::Value *CodeGenLLVM::EmitCall_get_address(const ir::Call *op) {
 
 llvm::Value *CodeGenLLVM::EmitCall_debug_info(const ir::Call *op) {
   auto callee = m_->getFunction(runtime::intrinsic::debug_log_repr);
-  CHECK_GE(op->read_args.size(), 1UL);
+  PADDLE_ENFORCE_GE(op->read_args.size(),
+                    1UL,  // a single argument already satisfies the check
+                    phi::errors::InvalidArgument(
+                        "The number of arguments of debug_log_repr should "
+                        "be at least 1"));
   std::vector<llvm::Value *> args;
   for (auto &arg : op->read_args) {
     args.push_back(Visit(&arg));
@@ -1315,7 +1330,9 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) {
     slices.push_back(load_inst);
   }
 
-  CHECK_EQ(slices.size(), 1UL);
+  PADDLE_ENFORCE_EQ(slices.size(),
+                    1UL,
+                    phi::errors::InvalidArgument("slices size should be 1."));
 
   return slices[0];
 }
@@ -1323,7 +1340,11 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) {
 llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t,
                                              llvm::Value *buffer,
                                              llvm::Value *index) {
-  CHECK_GT(t.lanes(), 1) << "type is not a vector type: " << t;
+  PADDLE_ENFORCE_GT(t.lanes(),
+                    1,
+                    phi::errors::InvalidArgument("type lanes should be greater "
+                                                 "than 1, but received %d",
+                                                 t.lanes()));
   llvm::PointerType *btype =
       llvm::dyn_cast<llvm::PointerType>(buffer->getType());
   CHECK(btype);
@@ -1338,7 +1359,11 @@ llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t,
 llvm::Value *CodeGenLLVM::CreateBufferPtr(Type t,
                                           llvm::Value *buffer,
                                           llvm::Value *index) {
-  CHECK_EQ(t.lanes(), 1);
+  PADDLE_ENFORCE_EQ(t.lanes(),
+                    1,
+                    phi::errors::InvalidArgument("type lanes should be 1, but "
+                                                 "received %d",
+                                                 t.lanes()));
   auto *btype = llvm::dyn_cast<llvm::PointerType>(buffer->getType());
   CHECK(btype);
   auto *ptype =
@@ -1355,7 +1380,10 @@ llvm::Value *CodeGenLLVM::CreateVecSlice(llvm::Value *vec,
                                          int lanes) {
   int total_lanes =
       llvm::dyn_cast<llvm::VectorType>(vec->getType())->getNumElements();
-  CHECK_LE(begin + lanes, total_lanes);
+  PADDLE_ENFORCE_LE(begin + lanes,
+                    total_lanes,
+                    phi::errors::InvalidArgument(
+                        "begin + lanes should not exceed total_lanes"));
   if (lanes == total_lanes && begin == 0) return vec;  // full slice
   std::vector<llvm::Constant *> indices;
   for (int i = 0; i < lanes; ++i) {
@@ -1422,7 +1450,10 @@ void CodeGenLLVM::AddTbaaMetadata(llvm::Instruction *inst,
       if (pstride_int && pbase_int) {
         int stride = pstride_int->value;
         base = pbase_int->value;
-        CHECK_GE(base, 0);
+        PADDLE_ENFORCE_GE(
+            base,
+            0,
+            phi::errors::InvalidArgument("base should not be negative"));
         width = NextPowerOfTwo(ramp->lanes * stride);
 
         while (base % width) {
@@ -1491,12 +1522,15 @@ llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BufferCreate *op) {
   CHECK(buffer_node);
   std::vector<llvm::Value *> args(
       {ll_const_int32(buffer_node->target.runtime_arch())});
-  uint64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8;
-  for (auto shape : buffer_node->shape) {
-    int shape_int = shape.as_int32();
-    memory_size *= shape_int;
+  int64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8;
+  // Build the byte size as an Expr (element bytes * each dim of the shape).
+  Expr buffer_size(static_cast<int64_t>(1));
+  buffer_size = buffer_size * ir::Expr(memory_size);
+  for (size_t i = 0; i < buffer_node->shape.size(); ++i) {
+    buffer_size = buffer_size * buffer_node->shape[i];
   }
-  args.push_back(ll_const_int64(memory_size));
+  ir::TryElevateInt32ToInt64({buffer_size});
+  args.push_back(Visit(&buffer_size));
   args.push_back(ll_const_int32(32));
 
   return Call(callee, args);
@@ -1596,29 +1630,50 @@ llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BuiltinIntrin *op) {
   std::string func_name = op->name;
   if (op->id == -1) {
     if (func_name == "bitwise_and") {
-      CHECK_GE(op->args.size(), 2U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        2U,
+                        phi::errors::InvalidArgument(
+                            "bitwise_and should have at least 2 arguments"));
       return b_->CreateAnd(Visit(&op->args[0]), Visit(&op->args[1]));
     } else if (func_name == "bitwise_or") {
-      CHECK_GE(op->args.size(), 2U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        2U,
+                        phi::errors::InvalidArgument(
+                            "bitwise_or should have at least 2 arguments"));
       return b_->CreateOr(Visit(&op->args[0]), Visit(&op->args[1]));
     } else if (func_name == "bitwise_xor") {
-      CHECK_GE(op->args.size(), 2U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        2U,
+                        phi::errors::InvalidArgument(
+                            "bitwise_xor should have at least 2 arguments"));
       return b_->CreateXor(Visit(&op->args[0]), Visit(&op->args[1]));
     } else if (func_name == "bitwise_not") {
-      CHECK_GE(op->args.size(), 1U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        1U,
+                        phi::errors::InvalidArgument(
+                            "bitwise_not should have at least 1 argument"));
       return b_->CreateNot(Visit(&op->args[0]));
     } else if (func_name == "left_shift") {
-      CHECK_GE(op->args.size(), 2U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        2U,
+                        phi::errors::InvalidArgument(
+                            "left_shift should have at least 2 arguments"));
       return b_->CreateShl(Visit(&op->args[0]), Visit(&op->args[1]));
     } else if (func_name == "right_shift") {
-      CHECK_GE(op->args.size(), 2U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        2U,
+                        phi::errors::InvalidArgument(
+                            "right_shift should have at least 2 arguments"));
       if (op->args[0]->type().is_int()) {
         return b_->CreateAShr(Visit(&op->args[0]), Visit(&op->args[1]));
       } else {
         return b_->CreateLShr(Visit(&op->args[0]), Visit(&op->args[1]));
       }
     } else if (func_name == "isnan") {
-      CHECK_GE(op->args.size(), 1U);
+      PADDLE_ENFORCE_GE(op->args.size(),
+                        1U,
+                        phi::errors::InvalidArgument(
+                            "isnan should have at least 1 argument"));
       llvm::Value *v = Visit(&op->args[0]);
       return b_->CreateFCmpUNO(v, v);
     }
diff --git a/paddle/cinn/backends/llvm/codegen_llvm_test.cc b/paddle/cinn/backends/llvm/codegen_llvm_test.cc
index 930e70f22e869..074e960aba678 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm_test.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm_test.cc
@@ -21,12 +21,12 @@
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Support/SourceMgr.h>
 #include <llvm/Support/raw_ostream.h>
-
 #include <algorithm>
 #include <iomanip>
 #include <memory>
 #include <utility>
 #include <vector>
+#include "paddle/common/enforce.h"
 
 #include "paddle/cinn/backends/llvm/cinn_runtime_llvm_ir.h"
 #include "paddle/cinn/cinn.h"
@@ -96,7 +96,10 @@ auto CreateIrBuffer(cinn::common::Type t,
                     std::string name,
                     std::vector<int> shape,
                     int data_alignment = 0) {
-  CHECK_GE(data_alignment, 0);
+  PADDLE_ENFORCE_GE(data_alignment,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "data_alignment should be greater than or equal to 0"));
   auto buffer = ir::_Buffer_::Make(std::move(name), std::move(t));
 
   if (data_alignment) {
diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc
index cfd796162241c..5987e3af7a7c3 100644
--- a/paddle/cinn/backends/llvm/codegen_x86.cc
+++ b/paddle/cinn/backends/llvm/codegen_x86.cc
@@ -30,7 +30,7 @@
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
 #include "paddle/cinn/runtime/intrinsic.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn::backends {
 
 CodeGenX86::CodeGenX86(llvm::Module* m,
@@ -144,8 +144,10 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) {
   symbol_table_->PopScope();
   std::swap(parallel_env_, par_env);
   std::swap(f_, f);
-  CHECK_NE(par_env.parallel_loop_count, 0)
-      << "find no parallel loop within parallel launch";
+  PADDLE_ENFORCE_NE(par_env.parallel_loop_count,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "find no parallel loop within parallel launch"));
   b_->SetInsertPoint(launch_end);
 }
 
diff --git a/paddle/cinn/backends/llvm/execution_engine.cc b/paddle/cinn/backends/llvm/execution_engine.cc
index 050fd4e0d8389..8a84d69a1d7a0 100644
--- a/paddle/cinn/backends/llvm/execution_engine.cc
+++ b/paddle/cinn/backends/llvm/execution_engine.cc
@@ -166,17 +166,20 @@ std::unique_ptr<llvm::MemoryBuffer> NaiveObjectCache::getObject(
 
   VLOG(2) << "===================== Create CINN ExecutionEngine end "
              "====================";
+  engine->ctx = std::make_unique<llvm::LLVMContext>();
+  engine->b = std::make_unique<llvm::IRBuilder<>>(*engine->ctx);
+  llvm::SMDiagnostic error;
+  engine->m = llvm::parseAssemblyString(
+      AsStringRef(backends::kRuntimeLlvmIr), error, *engine->ctx);
+  CHECK(engine->m != nullptr) << "Failed to parse the CINN runtime LLVM IR.";
   return engine;
 }
 
 template <typename CodeGenT>
-void ExecutionEngine::Link(const ir::Module &module) {
+void ExecutionEngine::Link(const ir::Module &module, bool add_module) {
   utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary);
   llvm::SMDiagnostic error;
-  auto ctx = std::make_unique<llvm::LLVMContext>();
-  auto m = llvm::parseAssemblyString(
-      AsStringRef(backends::kRuntimeLlvmIr), error, *ctx);
-  auto b = std::make_unique<llvm::IRBuilder<>>(*ctx);
+
   auto ir_emitter = std::make_unique<CodeGenT>(m.get(), b.get());
   VLOG(3) << "ir_emitter->Compile(module) Begin";
   ir_emitter->Compile(module);
@@ -200,7 +203,9 @@ void ExecutionEngine::Link(const ir::Module &module) {
       pass_manager, rawstream, nullptr, llvm::CGFT_ObjectFile);
   pass_manager.run(*m);
 
-  CHECK(AddModule(std::move(m), std::move(ctx)));
+  if (add_module) {
+    CHECK(AddSelfModule());  // keep the failure check the old code had
+  }
 
   if (VLOG_IS_ON(5)) {
     VLOG(5) << "======= dump jit execution session ======";
@@ -231,6 +236,9 @@ bool ExecutionEngine::AddModule(std::unique_ptr<llvm::Module> module,
   llvm::cantFail(jit_->addIRModule(std::move(tsm)));
   return true;
 }
+bool ExecutionEngine::AddSelfModule() {  // false if m/ctx already moved out
+  return m && ctx && AddModule(std::move(m), std::move(ctx));
+}
 
 void ExecutionEngine::ExportObject(const std::string &path) {
   FILE *of = fopen(path.c_str(), "w");
@@ -268,8 +276,11 @@ void ExecutionEngine::RegisterRuntimeSymbols() {
   }
 }
 
-template void ExecutionEngine::Link<CodeGenLLVM>(const ir::Module &module);
-template void ExecutionEngine::Link<CodeGenX86>(const ir::Module &module);
-template void ExecutionEngine::Link<CodeGenCUDA_Host>(const ir::Module &module);
+template void ExecutionEngine::Link<CodeGenLLVM>(const ir::Module &module,
+                                                 bool add_module);
+template void ExecutionEngine::Link<CodeGenX86>(const ir::Module &module,
+                                                bool add_module);
+template void ExecutionEngine::Link<CodeGenCUDA_Host>(const ir::Module &module,
+                                                      bool add_module);
 
 }  // namespace cinn::backends
diff --git a/paddle/cinn/backends/llvm/execution_engine.h b/paddle/cinn/backends/llvm/execution_engine.h
index 63f9427a53edb..44b212f245f90 100644
--- a/paddle/cinn/backends/llvm/execution_engine.h
+++ b/paddle/cinn/backends/llvm/execution_engine.h
@@ -79,18 +79,22 @@ class ExecutionEngine {
   void *Lookup(absl::string_view name);
 
   template <typename CodeGenT = CodeGenLLVM>
-  void Link(const ir::Module &module);
+  void Link(const ir::Module &module, bool add_module = true);
 
   void ExportObject(const std::string &path);
 
   bool AddModule(std::unique_ptr<llvm::Module> module,
                  std::unique_ptr<llvm::LLVMContext> context);
 
+  bool AddSelfModule();
+
  protected:
   explicit ExecutionEngine(bool enable_object_cache,
                            RuntimeSymbols &&module_symbols)
       : cache_(std::make_unique<NaiveObjectCache>()),
-        module_symbols_(std::move(module_symbols)) {}
+        module_symbols_(std::move(module_symbols)),
+        ctx(std::make_unique<llvm::LLVMContext>()),
+        b(std::make_unique<llvm::IRBuilder<>>(*ctx)) {}
 
   void RegisterRuntimeSymbols();
 
@@ -106,6 +110,10 @@ class ExecutionEngine {
   std::unique_ptr<llvm::orc::LLJIT> jit_;
   std::unique_ptr<NaiveObjectCache> cache_;
   RuntimeSymbols module_symbols_;
+
+  std::unique_ptr<llvm::LLVMContext> ctx;
+  std::unique_ptr<llvm::Module> m;
+  std::unique_ptr<llvm::IRBuilder<>> b;
 };
 
 }  // namespace cinn::backends
diff --git a/paddle/cinn/backends/llvm/execution_engine_test.cc b/paddle/cinn/backends/llvm/execution_engine_test.cc
index a13f329a81259..beb3ec61fae25 100644
--- a/paddle/cinn/backends/llvm/execution_engine_test.cc
+++ b/paddle/cinn/backends/llvm/execution_engine_test.cc
@@ -26,7 +26,6 @@
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Support/SourceMgr.h>
 #include <llvm/Support/raw_ostream.h>
-
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
@@ -35,6 +34,7 @@
 #include <tuple>
 #include <utility>
 #include <vector>
+#include "paddle/common/enforce.h"
 
 #include "paddle/cinn/backends/llvm/cinn_runtime_llvm_ir.h"
 #include "paddle/cinn/backends/llvm/codegen_llvm.h"
@@ -91,7 +91,11 @@ auto CreateTestBuffer() {
   }
 
   float *Cd = reinterpret_cast<float *>(C->memory);
-  CHECK_EQ(C->num_elements(), A->num_elements());
+  PADDLE_ENFORCE_EQ(
+      C->num_elements(),
+      A->num_elements(),
+      phi::errors::InvalidArgument(
+          "The number of elements of C and A should be the same."));
 
   return std::make_tuple(A, B, C);
 }
diff --git a/paddle/cinn/backends/llvm/llvm_intrin_rule.h b/paddle/cinn/backends/llvm/llvm_intrin_rule.h
index 903c056196f4e..14e3718299c0f 100644
--- a/paddle/cinn/backends/llvm/llvm_intrin_rule.h
+++ b/paddle/cinn/backends/llvm/llvm_intrin_rule.h
@@ -26,17 +26,24 @@
 #include "paddle/cinn/ir/intrinsic_ops.h"
 #include "paddle/cinn/ir/registry.h"
 #include "paddle/cinn/lang/packed_func.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace codegen {
 
 template <int id, int arg_nums, bool add_float_suffix = true>
 inline void MakeFloatIntrinOp(lang::Args args, lang::RetValue *rv) {
-  CHECK_GE(args.size(), 1U);
+  PADDLE_ENFORCE_GE(args.size(),
+                    1U,
+                    phi::errors::InvalidArgument(
+                        "The number of args should be at least 1."));
   Expr arg = args[0];
   ir::Call *node = arg->as<ir::Call>();
   CHECK(node);
-  CHECK_GE(node->read_args.size(), arg_nums);
+  PADDLE_ENFORCE_GE(
+      node->read_args.size(),
+      arg_nums,
+      phi::errors::InvalidArgument(
+          "The number of read args should be at least arg_nums."));
   if (add_float_suffix) {
     CHECK(node->type().is_float());
     *rv = ir::intrinsics::BuiltinIntrin::Make(
@@ -85,7 +92,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_isfinite", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -96,7 +106,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_isinf", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -113,7 +126,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_rsqrt", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -124,7 +140,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_exp10", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -136,7 +155,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_tan", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -147,7 +169,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_tanh", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -168,7 +193,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_cosh", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
@@ -180,7 +208,10 @@ void RegisterCpuIntrinRule() {
 
   ir::Registry::Register("lower_cpu_intrinsic_sinh", true)
       .SetBody([](lang::Args args, lang::RetValue *rv) {
-        CHECK_GE(args.size(), 1U);
+        PADDLE_ENFORCE_GE(args.size(),
+                          1U,
+                          phi::errors::InvalidArgument(
+                              "The number of args should be at least 1."));
         Expr arg0 = args[0];
         ir::Call *node = arg0->as<ir::Call>();
         CHECK(node);
diff --git a/paddle/cinn/backends/llvm/llvm_optimizer.cc b/paddle/cinn/backends/llvm/llvm_optimizer.cc
index e64fb9f42ee0b..22f9a37351664 100644
--- a/paddle/cinn/backends/llvm/llvm_optimizer.cc
+++ b/paddle/cinn/backends/llvm/llvm_optimizer.cc
@@ -74,12 +74,12 @@ class CustomPassManager : public PassManagerT {
   void add(llvm::Pass *pass) override {
     if (print_passes_) {
       if (is_function_pass_manager_) {
-        VLOG(1) << "llvm run function pass[" << std::string(pass->getPassName())
+        VLOG(4) << "llvm run function pass[" << std::string(pass->getPassName())
                 << "]";
       }
 
       if (is_module_pass_manager_) {
-        VLOG(1) << "llvm run module pass[" << std::string(pass->getPassName())
+        VLOG(4) << "llvm run module pass[" << std::string(pass->getPassName())
                 << "]";
       }
     }
diff --git a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc
index 3885ebe0c4199..52dbe7f024307 100644
--- a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc
+++ b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc
@@ -20,8 +20,8 @@
 #include <iostream>
 
 #include "paddle/cinn/runtime/flags.h"
+#include "paddle/common/enforce.h"
 #include "paddle/common/flags.h"
-
 PD_DECLARE_bool(verbose_function_register);
 
 namespace cinn {
@@ -51,8 +51,10 @@ void RuntimeSymbols::Register(const std::string &name, void *address) {
   std::lock_guard<std::mutex> lock(mu_);
   auto it = symbols_.find(name);
   if (it != symbols_.end()) {
-    CHECK_EQ(it->second, address)
-        << "Duplicate register symbol [" << name << "]";
+    PADDLE_ENFORCE_EQ(
+        it->second,
+        address,
+        phi::errors::InvalidArgument("Duplicate register symbol [%s]", name));
     return;
   }
 
diff --git a/paddle/cinn/backends/modular.cc b/paddle/cinn/backends/modular.cc
index fb736154c7bfc..f735b8b6da56a 100644
--- a/paddle/cinn/backends/modular.cc
+++ b/paddle/cinn/backends/modular.cc
@@ -15,7 +15,7 @@
 #include "paddle/cinn/backends/modular.h"
 
 #include "paddle/cinn/ir/ir_visitor.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 
@@ -104,8 +104,14 @@ class ModularEvaluator : public ir::IRVisitorRequireReImpl<ModularEntry> {
   }
 
   static int gcd(int a, int b) {
-    CHECK_GE(a, 0);
-    CHECK_GE(b, 0);
+    PADDLE_ENFORCE_GE(
+        a,
+        0,
+        phi::errors::InvalidArgument("a should be greater than or equal to 0"));
+    PADDLE_ENFORCE_GE(
+        b,
+        0,
+        phi::errors::InvalidArgument("b should be greater than or equal to 0"));
     if (a < b) std::swap(a, b);
     if (b == 0) return a;
 
diff --git a/paddle/cinn/backends/nvrtc/header_generator.cc b/paddle/cinn/backends/nvrtc/header_generator.cc
index d4b2b9504673f..7d88ed16d0413 100644
--- a/paddle/cinn/backends/nvrtc/header_generator.cc
+++ b/paddle/cinn/backends/nvrtc/header_generator.cc
@@ -16,7 +16,7 @@
 
 #include "glog/logging.h"
 #include "jitify.hpp"  // NOLINT
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
 namespace nvrtc {
@@ -27,8 +27,10 @@ HeaderGeneratorBase& JitSafeHeaderGenerator::GetInstance() {
 }
 
 const size_t JitSafeHeaderGenerator::size() const {
-  CHECK_EQ(include_names_.size(), headers_.size())
-      << "Internal error in size of header files.";
+  PADDLE_ENFORCE_EQ(
+      include_names_.size(),
+      headers_.size(),
+      phi::errors::InvalidArgument("Internal error in size of header files."));
   return include_names_.size();
 }
 
diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util.cc b/paddle/cinn/backends/nvrtc/nvrtc_util.cc
index 737d887ea809c..1b887268a1ae8 100644
--- a/paddle/cinn/backends/nvrtc/nvrtc_util.cc
+++ b/paddle/cinn/backends/nvrtc/nvrtc_util.cc
@@ -29,7 +29,7 @@
 #include "paddle/cinn/common/common.h"
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_string(cinn_nvcc_cmd_path);
 PD_DECLARE_string(nvidia_package_dir);
 PD_DECLARE_bool(nvrtc_compile_to_cubin);
@@ -187,7 +187,9 @@ std::string Compiler::CompileCudaSource(const std::string& code,
     std::string log;
     log.resize(log_size);
     NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0]));
-    CHECK_EQ(compile_res, NVRTC_SUCCESS) << log << "\nThe code is:\n" << code;
+    PADDLE_ENFORCE_EQ(compile_res,
+                      NVRTC_SUCCESS,
+                      phi::errors::Fatal("%s\nThe code is:\n%s", log, code));
   }
 
   size_t size;
diff --git a/paddle/cinn/common/arithmetic.cc b/paddle/cinn/common/arithmetic.cc
index e2c4ed1b8a6a7..08bf94724dedb 100644
--- a/paddle/cinn/common/arithmetic.cc
+++ b/paddle/cinn/common/arithmetic.cc
@@ -25,7 +25,7 @@
 #include "paddle/cinn/ir/ir_visitor.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 
@@ -222,7 +222,10 @@ class GiNaCToExprVisitor : public GiNaC::symbol::visitor,
 
     auto* intv = cur.As<IntImm>();
     CHECK(intv);
-    CHECK_EQ(intv->value, -1);
+    PADDLE_ENFORCE_EQ(
+        intv->value,
+        -1,
+        phi::errors::InvalidArgument("The power value should be -1."));
 
     cur = Div::Make(Expr(1), a);
   }
@@ -296,9 +299,12 @@ std::tuple<Expr, bool /*positive*/> Solve(Expr lhs, Expr rhs, Var var) {
   ginac::lst vars{symbol};
   ginac::ex res = ginac::lsolve(eqs, vars);
 
-  CHECK_EQ(res.nops(), 1);
+  PADDLE_ENFORCE_EQ(
+      res.nops(), 1, phi::errors::InvalidArgument("The res nops should be 1."));
   auto item = res.op(0);
-  CHECK_EQ(item.nops(), 2);
+  PADDLE_ENFORCE_EQ(item.nops(),
+                    2,
+                    phi::errors::InvalidArgument("The item nops should be 2."));
   Expr value = converter.GinacToExpr(item.op(1));
 
   // tell the symbol
diff --git a/paddle/cinn/common/broadcast_tree.cc b/paddle/cinn/common/broadcast_tree.cc
index 74ed4aff42798..f50e06bed4fd4 100644
--- a/paddle/cinn/common/broadcast_tree.cc
+++ b/paddle/cinn/common/broadcast_tree.cc
@@ -17,8 +17,12 @@
 #include <optional>
 #include <unordered_map>
 
+#include "paddle/common/enforce.h"
+#include "paddle/common/flags.h"
 #include "paddle/pir/include/dialect/shape/utils/dim_expr_util.h"
 
+COMMON_DECLARE_int64(pir_broadcast_tree_limit);
+
 namespace cinn::common {
 
 namespace {
@@ -91,6 +95,9 @@ template <typename DoEachT>
 bool SearchBroadcastImpl(const symbol::Broadcast<symbol::DimExpr>& variadic,
                          const DoEachT& DoEach) {
   const auto& operands = *(variadic.operands);
+  if (operands.size() > 3) {
+    PADDLE_THROW(phi::errors::Fatal("Too many broadcast leaves to compile!"));
+  }
   for (const auto& operand : operands) {
     CHECK(!operand.isa<int64_t>());
     if (SearchBroadcast(operand, DoEach)) return true;
@@ -213,21 +220,22 @@ BroadcastLeaf GetCstrRhsEqOneLeaves(
 
 BroadcastBranch<BroadcastTree> ConstructBroadcastBranch(
     const symbol::Broadcastable<symbol::DimExpr>& broadcastable_condition,
-    const BroadcastLeaf& leaves) {
+    const BroadcastLeaf& leaves,
+    int* num_of_leaves) {
   BroadcastLeaf cstr_lhs_eq_rhs_leaves =
       GetCstrLhsEqRhsLeaves(broadcastable_condition, leaves);
   BroadcastLeaf cstr_lhs_eq_one_leaves =
       GetCstrLhsEqOneLeaves(broadcastable_condition, leaves);
   BroadcastLeaf cstr_rhs_eq_one_leaves =
       GetCstrRhsEqOneLeaves(broadcastable_condition, leaves);
-  // clang-format off
   return BroadcastBranch<BroadcastTree>{
       /*broadcastable_condition*/ broadcastable_condition,
-      /*cstr_lhs_eq_rhs_branch*/ ConstructBroadcastTree(cstr_lhs_eq_rhs_leaves),
-      /*cstr_lhs_eq_one_branch*/ ConstructBroadcastTree(cstr_lhs_eq_one_leaves),
-      /*cstr_rhs_eq_one_branch*/ ConstructBroadcastTree(cstr_rhs_eq_one_leaves)
-    };
-  // clang-format on
+      /*cstr_lhs_eq_rhs_branch*/
+      ConstructBroadcastTree(cstr_lhs_eq_rhs_leaves, num_of_leaves),
+      /*cstr_lhs_eq_one_branch*/
+      ConstructBroadcastTree(cstr_lhs_eq_one_leaves, num_of_leaves),
+      /*cstr_rhs_eq_one_branch*/
+      ConstructBroadcastTree(cstr_rhs_eq_one_leaves, num_of_leaves)};
 }
 
 }  // namespace
@@ -288,7 +296,10 @@ std::optional<symbol::Broadcastable<symbol::DimExpr>> GetFirstCstrBroadcastable(
   if (ret.has_value()) return ret.value();
   ForEachBroadcastDimExpr(leaves, [&](const auto& broadcast) -> bool {
     const auto& operands = broadcast.operands;
-    CHECK_GE(operands->size(), 2);
+    PADDLE_ENFORCE_GE(operands->size(),
+                      2,
+                      phi::errors::InvalidArgument(
+                          "The operands size should be at least 2."));
     CHECK(operands->at(0) != operands->at(1));
     ret = symbol::Broadcastable<symbol::DimExpr>{operands->at(0),
                                                  operands->at(1)};
@@ -297,11 +308,19 @@ std::optional<symbol::Broadcastable<symbol::DimExpr>> GetFirstCstrBroadcastable(
   return ret;
 }
 
-BroadcastTree ConstructBroadcastTree(const BroadcastLeaf& leaves) {
+BroadcastTree ConstructBroadcastTree(const BroadcastLeaf& leaves,
+                                     int* num_of_leaves) {
   std::optional<symbol::Broadcastable<symbol::DimExpr>>
       broadcastable_condition = GetFirstCstrBroadcastable(leaves);
-  if (!broadcastable_condition.has_value()) return leaves;
-  return ConstructBroadcastBranch(broadcastable_condition.value(), leaves);
+  if (!broadcastable_condition.has_value()) {
+    (*num_of_leaves)++;
+    if (*num_of_leaves > FLAGS_pir_broadcast_tree_limit) {
+      PADDLE_THROW(phi::errors::Fatal("Too many broadcast leaves to compile!"));
+    }
+    return leaves;
+  }
+  return ConstructBroadcastBranch(
+      broadcastable_condition.value(), leaves, num_of_leaves);
 }
 
 namespace {
diff --git a/paddle/cinn/common/broadcast_tree.h b/paddle/cinn/common/broadcast_tree.h
index 5b8c051299af8..eee72a1f3cd38 100644
--- a/paddle/cinn/common/broadcast_tree.h
+++ b/paddle/cinn/common/broadcast_tree.h
@@ -29,7 +29,8 @@ using BroadcastLeaf = adt::List<std::vector<symbol::DimExpr>>;
 
 using BroadcastTree = adt::Tree<BroadcastBranch, BroadcastLeaf>;
 
-BroadcastTree ConstructBroadcastTree(const BroadcastLeaf& leaves);
+BroadcastTree ConstructBroadcastTree(const BroadcastLeaf& leaves,
+                                     int* num_of_leaves);
 
 std::string ToTxtString(const BroadcastTree&);
 
diff --git a/paddle/cinn/common/broadcast_tree_test.cc b/paddle/cinn/common/broadcast_tree_test.cc
index 8a09e8abd7dee..0484d38690dd2 100644
--- a/paddle/cinn/common/broadcast_tree_test.cc
+++ b/paddle/cinn/common/broadcast_tree_test.cc
@@ -66,7 +66,8 @@ TEST(BroadcastTree, Naive) {
                                     MakeBroadcastDimExpr(expr1, expr2),
                                     MakeBroadcastDimExpr(expr3, expr4)};
   BroadcastLeaf leaf = adt::List<std::vector<DimExpr>>{tensor_shape};
-  BroadcastTree tree = ConstructBroadcastTree(leaf);
+  int num_of_leaves = 0;
+  BroadcastTree tree = ConstructBroadcastTree(leaf, &num_of_leaves);
   ASSERT_TRUE(tree.Has<BroadcastBranch<BroadcastTree>>());
   const auto& branch = tree.Get<BroadcastBranch<BroadcastTree>>();
   const auto& [cstr_broadcastable,
@@ -96,7 +97,8 @@ TEST(BroadcastTree, SimplifyConstantBroadcast) {
                                     MakeBroadcastDimExpr(expr1, expr2),
                                     MakeBroadcastDimExpr(expr3, expr4)};
   BroadcastLeaf leaf = adt::List<std::vector<DimExpr>>{tensor_shape};
-  BroadcastTree tree = ConstructBroadcastTree(leaf);
+  int num_of_leaves = 0;
+  BroadcastTree tree = ConstructBroadcastTree(leaf, &num_of_leaves);
   ASSERT_TRUE(tree.Has<BroadcastBranch<BroadcastTree>>());
   const auto& branch = tree.Get<BroadcastBranch<BroadcastTree>>();
   const auto& [cstr_broadcastable,
diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc
index 3b4f2e7f2f3d9..4b1021f3dcc2a 100644
--- a/paddle/cinn/common/cas.cc
+++ b/paddle/cinn/common/cas.cc
@@ -28,7 +28,7 @@
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 using namespace ir;  // NOLINT
@@ -37,6 +37,9 @@ Expr AutoSimplify(
     Expr u,
     const absl::flat_hash_map<std::string, CasInterval>& var_intervals) {
   VLOG(7) << "Begin AutoSimplify: " << u;
+  if (u.type().is_float()) {
+    return u;
+  }
   u = detail::ConvertCinnToCAS(u);
   absl::flat_hash_map<std::string, CasInterval> s_var_intervals;
   for (auto& item : var_intervals) {
@@ -136,7 +139,8 @@ namespace detail {
 // Is a Divisible to b.
 // @{
 bool IsDivisible(int64_t a, int64_t b) {
-  CHECK_NE(b, 0);
+  PADDLE_ENFORCE_NE(
+      b, 0, phi::errors::InvalidArgument("The divisor %d should not be 0.", b));
   return a % b == 0;
 }
 bool IsDivisible(const Sum* a, int b);
@@ -1482,7 +1486,10 @@ Expr CasSimplifyMutator::SimplifySpecificSum(Expr tmp) {
   if (!right_mod || (!left_mul && !left_div)) {
     return tmp;
   }
-  CHECK_GE(right_mod->operands().size(), 2U);
+  PADDLE_ENFORCE_GE(right_mod->operands().size(),
+                    2U,
+                    phi::errors::InvalidArgument(
+                        "right_mod's operands size should be at least 2"));
   Expr mod_left = right_mod->operand(0);
   Expr mod_right = right_mod->operand(1);
   if (!mod_left->type().is_integer() || !mod_right->type().is_integer()) {
@@ -1492,7 +1499,10 @@ Expr CasSimplifyMutator::SimplifySpecificSum(Expr tmp) {
     // case 1: (m / n) * n + m % n = m (m, n's type is int)
     // case 2: (m / n1) * n3 + (n2 * m) % n3 = n2 * m if n3 = n1 * n2 (m, n1,
     // n2, n3's type is int)
-    CHECK_GE(left_mul->operands().size(), 2U);
+    PADDLE_ENFORCE_GE(left_mul->operands().size(),
+                      2U,
+                      phi::errors::InvalidArgument(
+                          "left_mul's operands size should be at least 2"));
     Expr mul_left = left_mul->operand(0);
     Expr mul_right = left_mul->operand(1);
 
@@ -1509,7 +1519,10 @@ Expr CasSimplifyMutator::SimplifySpecificSum(Expr tmp) {
     if (!div) {
       return tmp;
     }
-    CHECK_GE(div->operands().size(), 2U);
+    PADDLE_ENFORCE_GE(div->operands().size(),
+                      2U,
+                      phi::errors::InvalidArgument(
+                          "div's operands size should be at least 2"));
     Expr div_left = div->operand(0);
     Expr div_right = div->operand(1);
     if (!div_left->type().is_integer() || !div_right->type().is_integer()) {
diff --git a/paddle/cinn/common/cas.h b/paddle/cinn/common/cas.h
old mode 100755
new mode 100644
index 7fbd0bfe6aa00..729d8e40c0db7
--- a/paddle/cinn/common/cas.h
+++ b/paddle/cinn/common/cas.h
@@ -22,7 +22,7 @@
 #include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/ir_printer.h"
 #include "paddle/cinn/optim/ir_simplify.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 
@@ -37,7 +37,10 @@ Expr ReplaceMaxToConstant(Expr expr);
 struct CasInterval {
   template <typename T>
   CasInterval(T l, T r) : l(l), r(r) {
-    CHECK_LE(l, r) << "left should not be larger than right";
+    PADDLE_ENFORCE_LE(
+        l,
+        r,
+        phi::errors::InvalidArgument("left should not be larger than right"));
   }
 
   /**
@@ -51,12 +54,12 @@ struct CasInterval {
    * 1 <= iterator_i <= 5
    */
   CasInterval(Expr expr_l, Expr expr_r) {
-    VLOG(2) << "CasInterval is : [" << expr_l << ", " << expr_r << "].";
+    VLOG(6) << "CasInterval is : [" << expr_l << ", " << expr_r << "].";
     expr_r = detail::ReplaceMinToConstant(expr_r);
     expr_l = detail::ReplaceMaxToConstant(expr_l);
     optim::Simplify(&expr_l);
     optim::Simplify(&expr_r);
-    VLOG(2) << "After simplify, CasInterval is : [" << expr_l << ", " << expr_r
+    VLOG(6) << "After simplify, CasInterval is : [" << expr_l << ", " << expr_r
             << "].";
 
     if (expr_l.is_constant() && expr_r.is_constant()) {
diff --git a/paddle/cinn/common/cas_test.cc b/paddle/cinn/common/cas_test.cc
index c0b614eb972fa..62ca04e85467f 100644
--- a/paddle/cinn/common/cas_test.cc
+++ b/paddle/cinn/common/cas_test.cc
@@ -458,9 +458,6 @@ TEST(CAS, cond) {
 TEST(CAS, SimplifyFracOp) {
   Expr frac = Expr(1) / Expr(7) / Expr(6) / Expr(5) / Expr(4);
   EXPECT_EQ(GetStreamCnt(AutoSimplify(frac)), "0");
-
-  Expr frac_f = Expr(20.0f) / Expr(2.0f) / Expr(1.0f) / Expr(5.0f);
-  EXPECT_EQ(GetStreamCnt(AutoSimplify(frac_f)), "2.00000000f");
 }
 
 }  // namespace common
diff --git a/paddle/cinn/common/cinn_value.cc b/paddle/cinn/common/cinn_value.cc
index 3b25f93201333..82a3c86b12720 100644
--- a/paddle/cinn/common/cinn_value.cc
+++ b/paddle/cinn/common/cinn_value.cc
@@ -18,7 +18,7 @@
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/poly/stage.h"
 #include "paddle/cinn/runtime/cinn_runtime.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 
 namespace ir {
@@ -128,23 +128,38 @@ bool CINNValue::is_tensor() const {
 }
 
 CINNValue::operator std::string() const {
-  CHECK_EQ(type_code_, TypeCode<std::string>());
+  PADDLE_ENFORCE_EQ(
+      type_code_,
+      TypeCode<std::string>(),
+      phi::errors::InvalidArgument("The type_code is not std::string."));
   return absl::any_cast<std::string>(shared_);
 }
 CINNValue::operator ir::Var() const {
-  CHECK_EQ(type_code_, TypeCode<ir::Var>());
+  PADDLE_ENFORCE_EQ(
+      type_code_,
+      TypeCode<ir::Var>(),
+      phi::errors::InvalidArgument("The type_code is not ir::Var."));
   return absl::any_cast<ir::Var>(shared_);
 }
 CINNValue::operator ir::Expr() const {
-  CHECK_EQ(type_code_, TypeCode<ir::Expr>());
+  PADDLE_ENFORCE_EQ(
+      type_code_,
+      TypeCode<ir::Expr>(),
+      phi::errors::InvalidArgument("The type_code is not ir::Expr."));
   return absl::any_cast<Expr>(shared_);
 }
 CINNValue::operator CINNValuePack() const {
-  CHECK_EQ(type_code_, TypeCode<CINNValuePack>());
+  PADDLE_ENFORCE_EQ(
+      type_code_,
+      TypeCode<CINNValuePack>(),
+      phi::errors::InvalidArgument("The type_code is not CINNValuePack."));
   return absl::any_cast<CINNValuePack>(shared_);
 }
 CINNValue::operator poly::StageMap() const {
-  CHECK_EQ(type_code(), TypeCode<poly::StageMap>());
+  PADDLE_ENFORCE_EQ(
+      type_code(),
+      TypeCode<poly::StageMap>(),
+      phi::errors::InvalidArgument("The type_code is not poly::StageMap."));
   return absl::any_cast<poly::StageMap>(shared_);
 }
 CINNValue::CINNValue(char *value)
@@ -181,11 +196,17 @@ CINNValuePack _CINNValuePack_::Make(const std::vector<CINNValue> &array) {
   return CINNValuePack(node);
 }
 CINNValue &_CINNValuePack_::operator[](int offset) {
-  CHECK_LT(offset, size());
+  PADDLE_ENFORCE_LT(
+      offset,
+      size(),
+      phi::errors::InvalidArgument("The offset is out of range."));
   return values_[offset];
 }
 const CINNValue &_CINNValuePack_::operator[](int offset) const {
-  CHECK_LT(offset, size());
+  PADDLE_ENFORCE_LT(
+      offset,
+      size(),
+      phi::errors::InvalidArgument("The offset is out of range."));
   return values_[offset];
 }
 void _CINNValuePack_::AddValue(const CINNValue &value) {
diff --git a/paddle/cinn/common/cinn_value.h b/paddle/cinn/common/cinn_value.h
old mode 100755
new mode 100644
index 3cfb4214d76b9..aa64b129df673
--- a/paddle/cinn/common/cinn_value.h
+++ b/paddle/cinn/common/cinn_value.h
@@ -23,7 +23,7 @@
 #include "paddle/cinn/common/object.h"
 #include "paddle/cinn/common/type.h"
 #include "paddle/cinn/runtime/cinn_runtime.h"
-
+#include "paddle/common/enforce.h"
 struct cinn_buffer_t;
 
 namespace cinn {
@@ -97,12 +97,18 @@ struct CINNValuePack : public Shared<_CINNValuePack_> {
   bool empty() const { return (*operator->()).empty(); }
 
   CINNValue& back() {
-    CHECK_GT((*operator->()).size(), 0);
+    PADDLE_ENFORCE_GT((*operator->()).size(),
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The size of the array should greater than 0."));
     return (*operator->())[size() - 1];
   }
 
   const CINNValue& back() const {
-    CHECK_GT((*operator->()).size(), 0);
+    PADDLE_ENFORCE_GT((*operator->()).size(),
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The size of the array should greater than 0."));
     return (*operator->())[size() - 1];
   }
 
diff --git a/paddle/cinn/common/float16_bfloat16_cuda_test.cu b/paddle/cinn/common/float16_bfloat16_cuda_test.cu
index fd6c39cc51f8f..5cded20e9cadf 100644
--- a/paddle/cinn/common/float16_bfloat16_cuda_test.cu
+++ b/paddle/cinn/common/float16_bfloat16_cuda_test.cu
@@ -39,9 +39,15 @@ class CudaMem {
   CudaMem() = default;
 
   void* mutable_data(size_t bytes) {
-    CHECK_GT(bytes, 0) << "Cannot allocate empty memory!";
+    PADDLE_ENFORCE_GT(
+        bytes,
+        0,
+        phi::errors::InvalidArgument("Cannot allocate empty memory!"));
     if (ptr) {
-      CHECK_EQ(bytes, bytes_) << "Try allocate memory twice!";
+      PADDLE_ENFORCE_EQ(
+          bytes,
+          bytes_,
+          phi::errors::InvalidArgument("Try allocate memory twice!"));
       return ptr;
     }
     CUDA_CALL(cudaMalloc(&ptr, bytes));
@@ -67,12 +73,14 @@ class CudaMem {
   void MemcpyFromHost(const void* src,
                       size_t bytes,
                       cudaStream_t stream = nullptr) {
-    CHECK_LE(bytes, bytes_) << "Too many data need copy";
+    PADDLE_ENFORCE_LE(
+        bytes, bytes_, phi::errors::InvalidArgument("Too many data need copy"));
     CUDA_CALL(cudaMemcpyAsync(ptr, src, bytes, cudaMemcpyHostToDevice, stream));
   }
 
   void MemcpyToHost(void* dst, size_t bytes, cudaStream_t stream = nullptr) {
-    CHECK_LE(bytes, bytes_) << "Too many data need copy";
+    PADDLE_ENFORCE_LE(
+        bytes, bytes_, phi::errors::InvalidArgument("Too many data need copy"));
     CUDA_CALL(cudaMemcpyAsync(dst, ptr, bytes, cudaMemcpyDeviceToHost, stream));
   }
 
diff --git a/paddle/cinn/common/graph_utils.cc b/paddle/cinn/common/graph_utils.cc
old mode 100755
new mode 100644
index b1110e8ca8aa0..b6223443b04fd
--- a/paddle/cinn/common/graph_utils.cc
+++ b/paddle/cinn/common/graph_utils.cc
@@ -23,7 +23,7 @@
 
 #include "paddle/cinn/common/common.h"
 #include "paddle/cinn/utils/dot_lang.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 
@@ -98,7 +98,10 @@ Graph::topological_order() const {
     queue.pop_front();
 
     for (auto &edge : top_node->outlinks()) {
-      CHECK_EQ(edge->source(), top_node);
+      PADDLE_ENFORCE_EQ(edge->source(),
+                        top_node,
+                        phi::errors::InvalidArgument(
+                            "The edge's source is not equal to the top node."));
       edge_order.push_back(edge.get());
       auto *sink = edge->sink();
       if ((--indegree[sink->id()]) == 0) {
@@ -107,9 +110,10 @@ Graph::topological_order() const {
     }
   }
 
-  CHECK_EQ(node_order.size(), nodes().size())
-      << "circle detected in the schedule graph:\n\n"
-      << Visualize();
+  PADDLE_ENFORCE_EQ(
+      node_order.size(),
+      nodes().size(),
+      phi::errors::InvalidArgument("circle detected:\n\n%s", Visualize()));
 
   return std::make_tuple(node_order, edge_order);
 }
diff --git a/paddle/cinn/common/graph_utils.h b/paddle/cinn/common/graph_utils.h
index 9834b2368d460..55d12bcfd12ae 100644
--- a/paddle/cinn/common/graph_utils.h
+++ b/paddle/cinn/common/graph_utils.h
@@ -31,7 +31,7 @@
 #include "paddle/cinn/common/object.h"
 #include "paddle/cinn/common/shared.h"
 #include "paddle/cinn/common/type.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 
@@ -86,7 +86,8 @@ class GraphNode : public Object {
   std::tuple<EdgeT*, EdgeT*> LinkTo(GraphNode* other) {
     EdgeT *a, *b;
     CHECK(other);
-    CHECK_NE(other, this) << "Cannot link to itself";
+    PADDLE_ENFORCE_NE(
+        other, this, phi::errors::InvalidArgument("Cannot link to itself"));
     auto outlink_edge = make_shared<GraphEdge>(this, other, index_outlinks);
     auto inlink_edge =
         make_shared<GraphEdge>(this, other, other->index_inlinks);
@@ -127,7 +128,10 @@ class GraphNode : public Object {
         break;
       }
     }
-    CHECK_EQ(outlink_linked, inlink_linked);
+    PADDLE_ENFORCE_EQ(outlink_linked,
+                      inlink_linked,
+                      phi::errors::InvalidArgument(
+                          "The outlink_linked should same as inlink_linked."));
     if (outlink_linked)
       return;
     else
diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc
index d326e652a7be7..c73091e8196be 100644
--- a/paddle/cinn/common/ir_util.cc
+++ b/paddle/cinn/common/ir_util.cc
@@ -21,7 +21,7 @@
 #include "paddle/cinn/ir/ir_mutator.h"
 #include "paddle/cinn/ir/ir_printer.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 
@@ -29,19 +29,34 @@ namespace {
 
 // ramp + scalar or broadcast
 Expr RampRelatedMul(ir::Ramp *ramp, Expr other) {
-  CHECK_EQ(other.type().ElementOf(), Int(32));
-  CHECK_EQ(ramp->base.type(), Int(32));
-  CHECK_EQ(ramp->stride.type(), Int(32));
+  PADDLE_ENFORCE_EQ(
+      other.type().ElementOf(),
+      Int(32),
+      phi::errors::InvalidArgument("The type of other should be int32."));
+  PADDLE_ENFORCE_EQ(
+      ramp->base.type(),
+      Int(32),
+      phi::errors::InvalidArgument("The type of ramp->base should be int32."));
+  PADDLE_ENFORCE_EQ(ramp->stride.type(),
+                    Int(32),
+                    phi::errors::InvalidArgument(
+                        "The type of ramp->stride should be int32."));
   auto *other_broadcast = other.As<ir::Broadcast>();
   if (other_broadcast) {
-    CHECK_EQ(ramp->lanes, other_broadcast->lanes);
+    PADDLE_ENFORCE_EQ(ramp->lanes,
+                      other_broadcast->lanes,
+                      phi::errors::InvalidArgument(
+                          "The lanes of ramp and other should be equal."));
     other = other_broadcast->value;
   }
   return ir::Ramp::Make(ramp->base * other, ramp->stride * other, ramp->lanes);
 }
 
 Expr RampRelatedMul(ir::Broadcast *broadcast, Expr other) {
-  CHECK_EQ(other.type().lanes(), 1);
+  PADDLE_ENFORCE_EQ(
+      other.type().lanes(),
+      1,
+      phi::errors::InvalidArgument("The lanes of other should be 1."));
   return ir::Broadcast::Make(broadcast->value * other, broadcast->lanes);
 }
 // ramp * ramp
@@ -51,17 +66,26 @@ Expr RampRelatedMul(ir::Ramp *ramp, ir::Ramp *other) {
 }
 // ramp + scalar
 Expr RampRelatedAdd(ir::Ramp *ramp, Expr other) {
-  CHECK_EQ(other.type().ElementOf(), Int(32));
+  PADDLE_ENFORCE_EQ(
+      other.type().ElementOf(),
+      Int(32),
+      phi::errors::InvalidArgument("The type of other should be int32."));
 
   auto *other_broadcast = other.As<ir::Broadcast>();
   if (other_broadcast) {
-    CHECK_EQ(ramp->lanes, other_broadcast->lanes);
+    PADDLE_ENFORCE_EQ(ramp->lanes,
+                      other_broadcast->lanes,
+                      phi::errors::InvalidArgument(
+                          "The lanes of ramp and other should be equal."));
     other = other_broadcast->value;
   }
   return ir::Ramp::Make(ramp->base + other, ramp->stride, ramp->lanes);
 }
 Expr RampRelatedAdd(ir::Broadcast *broadcast, Expr other) {
-  CHECK_EQ(other.type().lanes(), 1);
+  PADDLE_ENFORCE_EQ(
+      other.type().lanes(),
+      1,
+      phi::errors::InvalidArgument("The lanes of other should be 1."));
   return ir::Broadcast::Make(broadcast->value + other, broadcast->lanes);
 }
 // ramp + ramp
@@ -98,7 +122,11 @@ Expr RampRelatedAdd(Expr a, Expr b) {
   } else if (!a_broadcast && b_broadcast) {
     return RampRelatedAdd(b_broadcast, a);
   } else if (a_broadcast && b_broadcast) {
-    CHECK_EQ(a_broadcast->lanes, b_broadcast->lanes);
+    PADDLE_ENFORCE_EQ(
+        a_broadcast->lanes,
+        b_broadcast->lanes,
+        phi::errors::InvalidArgument(
+            "The lanes of a_broadcast and b_broadcast should be equal."));
     return ir::Broadcast::Make(a_broadcast->value + b_broadcast->value,
                                a_broadcast->lanes);
   } else {
@@ -125,7 +153,11 @@ Expr RampRelatedMul(Expr a, Expr b) {
   } else if (!a_broadcast && b_broadcast) {
     return RampRelatedMul(b_broadcast, a);
   } else if (a_broadcast && b_broadcast) {
-    CHECK_EQ(a_broadcast->lanes, b_broadcast->lanes);
+    PADDLE_ENFORCE_EQ(
+        a_broadcast->lanes,
+        b_broadcast->lanes,
+        phi::errors::InvalidArgument(
+            "The lanes of a_broadcast and b_broadcast should be equal."));
     return ir::Broadcast::Make(a_broadcast->value * b_broadcast->value,
                                a_broadcast->lanes);
   } else {
@@ -141,7 +173,11 @@ Expr IndiceToAbsOffset(const std::vector<Expr> &shape,
   VLOG(3) << "Begin IndiceToAbsOffset";
   VLOG(3) << "shape is : " << utils::Join(shape, ",");
   VLOG(3) << "indices is : " << utils::Join(indices, ",");
-  CHECK_LE(shape.size(), indices.size());
+  PADDLE_ENFORCE_LE(
+      shape.size(),
+      indices.size(),
+      phi::errors::InvalidArgument("The size of shape should be less than or "
+                                   "equal to the size of indices."));
   Expr res;
   ir::TryElevateInt32ToInt64(shape);
   for (int i = 0; i < shape.size(); i++) {
@@ -261,10 +297,11 @@ void CheckTensorUniqueInExpr(Expr expr) {
     if (!tensor_names.count(tp->name)) {
       tensor_names[tp->name] = tp;
     } else {
-      CHECK_EQ(tensor_names[tp->name], tp)
-          << "Found tensor not unique [" << tp->name
-          << "]\nThe original expression is \n"
-          << expr;
+      PADDLE_ENFORCE_EQ(
+          tensor_names[tp->name],
+          tp,
+          phi::errors::InvalidArgument(
+              "Found tensor not unique, The original express is %s .", expr));
     }
   }
 }
@@ -281,7 +318,11 @@ void CheckBufferUniqueInExpr(Expr expr) {
   absl::flat_hash_map<std::string, const ir::_Buffer_ *> buffer_name;
   auto check_buffer_uniq = [&](const ir::_Buffer_ *b) {
     if (buffer_name.count(b->name)) {
-      CHECK_EQ(buffer_name[b->name], b);
+      PADDLE_ENFORCE_EQ(
+          buffer_name[b->name],
+          b,
+          phi::errors::InvalidArgument(
+              "Found buffer not unique, The original express is %s .", expr));
     } else {
       buffer_name[b->name] = b->const_self();
     }
@@ -426,12 +467,18 @@ std::vector<Expr *> GetForloopStackToStore(Expr *expr,
 }
 
 Expr max(Expr a, Expr b) {
-  CHECK_EQ(a.type(), b.type());
+  PADDLE_ENFORCE_EQ(
+      a.type(),
+      b.type(),
+      phi::errors::InvalidArgument("The type of a and b should be equal."));
   return ir::Max::Make(a, b);
 }
 
 Expr min(Expr a, Expr b) {
-  CHECK_EQ(a.type(), b.type());
+  PADDLE_ENFORCE_EQ(
+      a.type(),
+      b.type(),
+      phi::errors::InvalidArgument("The type of a and b should be equal."));
   return ir::Min::Make(a, b);
 }
 
diff --git a/paddle/cinn/common/type.cc b/paddle/cinn/common/type.cc
index 41cfd9e638f90..5163d7b921d59 100644
--- a/paddle/cinn/common/type.cc
+++ b/paddle/cinn/common/type.cc
@@ -137,7 +137,10 @@ Type Type::ElementOf() const {
 }
 
 void Type::CheckTypeValid() const {
-  CHECK_NE(GetStorage().type_, type_t::Unk);
+  PADDLE_ENFORCE_NE(
+      GetStorage().type_,
+      type_t::Unk,
+      phi::errors::InvalidArgument("The type is not initialized."));
   if (GetStorage().type_ == type_t::Float && GetStorage().bits_ == 16) {
     CHECK(GetStorage().specific_type_ == specific_type_t::FP16 ||
           GetStorage().specific_type_ == specific_type_t::BF16)
diff --git a/paddle/cinn/common/union_find.h b/paddle/cinn/common/union_find.h
index a88f52dafe515..a76157e7f760e 100644
--- a/paddle/cinn/common/union_find.h
+++ b/paddle/cinn/common/union_find.h
@@ -26,7 +26,7 @@
 
 #include "paddle/cinn/common/object.h"
 #include "paddle/cinn/common/shared.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace common {
 
@@ -62,8 +62,11 @@ struct UnionFindNode : public Object {
 
   template <typename T>
   T* safe_as() {
-    CHECK_EQ(std::strcmp(T::__type_info__, type_info()), 0)
-        << "Want a " << T::__type_info__ << " but get a " << type_info();
+    PADDLE_ENFORCE_EQ(
+        std::strcmp(T::__type_info__, type_info()),
+        0,
+        phi::errors::InvalidArgument(
+            "Want a %s but get a %s", T::__type_info__, type_info()));
     return reinterpret_cast<T*>(this);
   }
 
diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
index 36fe9e340fcd9..5e7d3e6d876cf 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
+++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
@@ -324,12 +324,12 @@ void SplitOp::Build(pir::Builder& builder,             // NOLINT
 const char* GenerateShapeOp::attributes_name[attributes_num] = {
     "output_dim_exprs", "symbol_bindings"};
 
-void GenerateShapeOp::Build(
-    pir::Builder& builder,
-    pir::OperationArgument& argument,
-    const std::vector<pir::Value>& inputs,
-    const std::vector<pir::Attribute>& output_dim_exprs,
-    const GenerateShapeOp::SymbolBindings& symbol_bindings) {
+void GenerateShapeOp::Build(pir::Builder& builder,
+                            pir::OperationArgument& argument,
+                            const std::vector<pir::Value>& inputs,
+                            const std::vector<pir::Attribute>& output_dim_exprs,
+                            const SymbolBindings& symbol_bindings,
+                            const pir::Type& output_type) {
   if (inputs.empty()) {
     VLOG(3) << "GenerateShapeOp inputs is empty";
     for (const auto& attr : output_dim_exprs) {
@@ -344,13 +344,7 @@ void GenerateShapeOp::Build(
   argument.AddAttribute(
       "symbol_bindings",
       ConvertSymbolBindingsToAttribute(builder, symbol_bindings));
-  argument.AddOutputs({[&]() {
-    auto* ctx = pir::IrContext::Instance();
-    auto type = pir::Int64Type::get(ctx);
-    auto dim =
-        ::common::make_ddim({static_cast<int64_t>(output_dim_exprs.size())});
-    return DenseTensorType::get(ctx, type, dim);
-  }()});
+  argument.AddOutput(output_type);
   ::pir::PassStopGradientsDefaultly(argument);
 }
 
diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h
index 1eddfaffd0df1..06f306a0e3623 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h
+++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h
@@ -168,7 +168,8 @@ class IR_API GenerateShapeOp
                     pir::OperationArgument &argument,  // NOLINT
                     const std::vector<pir::Value> &inputs,
                     const std::vector<pir::Attribute> &output_dim_exprs,
-                    const SymbolBindings &symbol_bindings);
+                    const SymbolBindings &symbol_bindings,
+                    const pir::Type &output_type);
 
   void VerifySig() {}
 
diff --git a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml
index a8eac75248186..efbeaf298e7a0 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml
+++ b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml
@@ -81,9 +81,9 @@
 
 - op : reshape
   args : (Tensor x, int[] shape)
-  output : Tensor(out)
+  output : Tensor(out), Tensor(xshape)
   infer_meta :
-    func : ReshapeInferMeta
+    func : ReshapeWithXShapeInferMeta
   kernel :
     func : reshape
   interfaces : paddle::dialect::InferSymbolicShapeInterface
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
index 6aea2dc8b759b..c864410715531 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
@@ -57,6 +57,7 @@ COMMON_DECLARE_bool(print_ir);
 COMMON_DECLARE_bool(disable_dyshape_in_train);
 COMMON_DECLARE_bool(enable_cinn_accuracy_check);
 COMMON_DECLARE_bool(enable_fuse_parallel_matmul_pass);
+COMMON_DECLARE_bool(logging_pir_py_code_dump_symbolic_dims);
 PD_DECLARE_bool(group_schedule_tiling_first);
 
 namespace cinn::dialect::ir {
@@ -229,7 +230,7 @@ void ApplyCinnPass(::pir::Program* program,
                        CreatePassManager) {
   PirToPyCodeConverter(program)
       .file_name("original_programs.py")
-      .dump_symbolic_shape(false)
+      .dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims)
       .SaveIfFlagEnabled();
   ApplyPdToCinnPass(program, CreatePassManager);
   ApplyCinnPreprocessPass(program, CreatePassManager);
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc
index 63d5b519ce887..ec82d41742a70 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc
@@ -232,7 +232,7 @@ class BlockDimExprsAsserter {
     };
     std::vector<pir::Value> input_tensors{};
     std::vector<pir::Attribute> output_dim_expr_attrs{};
-    GenerateShapeOp::SymbolBindings symbol_bindings{};
+    SymbolBindings symbol_bindings{};
     bool success =
         MakeGenerateShapeOpAttribute(ir_ctx_,
                                      LocalDimExprs4Value,
@@ -242,14 +242,13 @@ class BlockDimExprsAsserter {
                                      &output_dim_expr_attrs,
                                      &symbol_bindings);
     if (!success) return std::nullopt;
-    auto out_shape_value =
-        builder_
-            .Build<cinn::dialect::GenerateShapeOp>(
-                input_tensors, output_dim_expr_attrs, symbol_bindings)
-            .out();
+    auto out_type = paddle::dialect::DenseTensorType::get(
+        builder_.ir_context(),
+        pir::Int64Type::get(builder_.ir_context()),
+        ::common::make_ddim({static_cast<int64_t>(dim_exprs.size())}));
     return builder_
         .Build<cinn::dialect::GenerateShapeOp>(
-            input_tensors, output_dim_expr_attrs, symbol_bindings)
+            input_tensors, output_dim_expr_attrs, symbol_bindings, out_type)
         .out();
   }
 
@@ -298,8 +297,11 @@ class BlockDimExprsAsserter {
     PADDLE_ENFORCE_EQ(lhs_numel,
                       rhs_numel,
                       ::common::errors::InvalidArgument(
+                          "Check [%s id:%d] infer symbolic shape failed. "
                           "The numel of lhs and rhs must be equal, but "
                           "received lhs's numel is [%d], rhs's numel is [%d]",
+                          op->name(),
+                          op->id(),
                           lhs_numel,
                           rhs_numel));
 
@@ -326,8 +328,8 @@ class BlockDimExprsAsserter {
             .out();
     auto assert_op = builder_.Build<paddle::dialect::AssertOp>(
         all_eq, assert_data, lhs_numel);
-    const std::string error_msg = "Check [" + op->name() + "_" +
-                                  std::to_string(op->id()) +
+    const std::string error_msg = "Check [" + op->name() +
+                                  " id:" + std::to_string(op->id()) +
                                   "] infer symbolic shape failed.";
     assert_op->set_attribute(
         paddle::dialect::AssertOp::ERROR_INFO_ATTR_NAME,
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc
index 6281baeadbef2..ca422c1a593c8 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc
@@ -190,6 +190,15 @@ ::pir::Operation* ConvertConcatOp(::pir::Operation* op,
   return pd_op;
 }
 
+::pir::Operation* ConvertGenerateShapeOp(
+    ::pir::Operation* op,
+    ::pir::IrMapping& ir_mapping,  // NOLINT
+    ::pir::Builder& builder) {     // NOLINT
+  auto* new_op = op->Clone(ir_mapping, {true, true, true});
+  builder.Insert(new_op);
+  return new_op;
+}
+
 ::pir::Operation* ConvertScaleOp(::pir::Operation* op,
                                  ::pir::IrMapping& ir_mapping,        // NOLINT
                                  ::pir::PatternRewriter& rewriter) {  // NOLINT
@@ -404,6 +413,9 @@ REGISTER_TRANSFORM_RULES(concat_op,
                          cinn::dialect::ConcatOp::name(),
                          cinn::dialect::details::ConvertConcatOp);
 
+REGISTER_TRANSFORM_RULES(generate_shape_op,
+                         cinn::dialect::GenerateShapeOp::name(),
+                         cinn::dialect::details::ConvertGenerateShapeOp);
 REGISTER_TRANSFORM_RULES(scale_op,
                          cinn::dialect::ScaleOp::name(),
                          cinn::dialect::details::ConvertScaleOp);
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
index 17317924fb07e..0ffd284ac79f7 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
@@ -38,9 +38,10 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op,
   std::vector<pir::Attribute> output_dim_expr_attrs{};
   GenerateShapeOp::SymbolBindings symbol_bindings{};
 
-  unsigned output_dim_idx = 0, input_dim_idx = 0;
   int64_t local_dim_expr_id = 0;
-  for (; output_dim_idx < output_shape.size(); ++output_dim_idx) {
+  for (unsigned output_dim_idx = 0, input_dim_idx = 0;
+       output_dim_idx < output_shape.size();
+       ++output_dim_idx) {
     const auto& dim_expr = output_shape.at(output_dim_idx);
     if (dim_expr.isa<int64_t>()) {
       output_dim_expr_attrs.emplace_back(
@@ -64,8 +65,16 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op,
       }
     }
   }
+  auto out_type = paddle::dialect::DenseTensorType::get(
+      rewriter.ir_context(),
+      pir::Int64Type::get(rewriter.ir_context()),
+      ::common::make_ddim(
+          {static_cast<int64_t>(output_dim_expr_attrs.size())}));
   auto cinn_generate_shape = rewriter.Build<cinn::dialect::GenerateShapeOp>(
-      std::vector<pir::Value>{input}, output_dim_expr_attrs, symbol_bindings);
+      std::vector<pir::Value>{input},
+      output_dim_expr_attrs,
+      symbol_bindings,
+      out_type);
   auto pd_reshape = rewriter.Build<paddle::dialect::ReshapeOp>(
       op->operand_source(0), cinn_generate_shape.result(0));
 
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc
index 0578c79b35a2b..473763bb4dcec 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc
@@ -313,9 +313,18 @@ std::optional<pir::Value> GetOutOfRewrittenGenerateShapeOp(
                                               &output_dim_expr_attrs,
                                               &symbol_bindings);
   if (!success) return std::nullopt;
+  auto out_type = [&]() -> pir::Type {
+    if (shape.type().isa<paddle::dialect::DenseTensorType>()) {
+      return shape.type();
+    }
+    return paddle::dialect::DenseTensorType::get(
+        rewriter->ir_context(),
+        pir::Int64Type::get(rewriter->ir_context()),
+        ::common::make_ddim({output_dim_expr_attrs.size()}));
+  }();
   return rewriter
       ->Build<cinn::dialect::GenerateShapeOp>(
-          input_tensors, output_dim_expr_attrs, symbol_bindings)
+          input_tensors, output_dim_expr_attrs, symbol_bindings, out_type)
       .out();
 }
 
@@ -323,9 +332,8 @@ bool ReplaceShapeOpsToGenerateShape(
     pir::OpOperand shape_operand,
     pir::PatternRewriter* rewriter,
     pir::ShapeConstraintIRAnalysis* shape_analysis) {
-  if (shape_operand.source()
-          .defining_op()
-          ->isa<cinn::dialect::GenerateShapeOp>()) {
+  auto* shape_def_op = shape_operand.source().defining_op();
+  if (!shape_def_op || shape_def_op->isa<cinn::dialect::GenerateShapeOp>()) {
     return false;
   }
   auto ShapeOrDataDimExprs4Value =
@@ -379,6 +387,103 @@ class FuseShapeOpsIntoGenerateShapeOpPattern
   }
 };
 
+// Rewrites any op whose single result holds exactly one int32/int64 element
+// with known symbolic data into a cinn::dialect::GenerateShapeOp, as long as
+// none of the result's users is such an op or a GenerateShapeOp itself.
+class FuseSingleElementShapeOpsIntoGenerateShapeOpPattern
+    : public pir::RewritePattern {
+ public:
+  explicit FuseSingleElementShapeOpsIntoGenerateShapeOpPattern(
+      pir::IrContext* context)
+      : pir::RewritePattern(MatchAnyOpTypeTag(),
+                            1 /*benefit*/,
+                            context,
+                            {} /*generated_names*/) {}
+
+  // Matches when `op` is a single-element shape op (IsSingleElementShapeOp),
+  // is not a GenerateShapeOp, has at least one use, and no user of its result
+  // is a single-element shape op or a GenerateShapeOp.
+  bool Match(pir::Operation* op) const override {
+    auto& shape_analysis =
+        pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram());
+    if (!IsSingleElementShapeOp(op, &shape_analysis)) return false;
+    if (op->isa<cinn::dialect::GenerateShapeOp>()) return false;
+
+    // Reject when any user is itself a single-element shape op or a
+    // GenerateShapeOp.
+    pir::Value output = op->result(0);
+    if (output.use_empty()) return false;
+    for (auto iter = output.use_begin(); iter != output.use_end(); ++iter) {
+      auto* user = iter->owner();
+      if (IsSingleElementShapeOp(user, &shape_analysis)) return false;
+      if (user->isa<cinn::dialect::GenerateShapeOp>()) return false;
+    }
+
+    return true;
+  }
+
+  // Builds a GenerateShapeOp for `op`'s result, redirects every use of that
+  // result to it, and erases `op` once it has no remaining uses. If the
+  // GenerateShapeOp cannot be built, logs a warning and replaces nothing.
+  void Rewrite(pir::Operation* op,
+               pir::PatternRewriter& rewriter) const override {
+    auto& shape_analysis =
+        pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram());
+
+    auto ShapeOrDataDimExprs4Value =
+        [&shape_analysis](
+            pir::Value value) -> const symbol::ShapeOrDataDimExprs& {
+      return shape_analysis.GetShapeOrDataForValue(value);
+    };
+    std::optional<pir::Value> opt_generated_shape =
+        GetOutOfRewrittenGenerateShapeOp(
+            op->result(0), &rewriter, ShapeOrDataDimExprs4Value);
+    if (!opt_generated_shape.has_value()) {
+      LOG(WARNING) << "Create GenerateShapeOp Failed.";
+      return;
+    }
+
+    rewriter.ReplaceAllUsesWith(op->result(0), opt_generated_shape.value());
+
+    if (op->use_empty()) {
+      rewriter.EraseOp(op);
+    }
+  }
+
+ private:
+  // Returns true when `op` has at least one operand and exactly one result,
+  // and that result is an int32/int64 DenseTensorType value whose symbolic
+  // shape-or-data carries exactly one data element.
+  bool IsSingleElementShapeOp(
+      pir::Operation* op,
+      pir::ShapeConstraintIRAnalysis* shape_analysis) const {
+    if (op->num_operands() == 0) return false;
+    if (op->num_results() != 1) return false;
+
+    pir::Value output = op->result(0);
+    // A result without a type cannot be a shape tensor; bail out before
+    // querying the shape analysis with it.
+    if (!output || !output.type()) return false;
+
+    const auto& out_shape = shape_analysis->GetShapeOrDataForValue(output);
+    if (!out_shape.isa<symbol::TensorShapeOrDataDimExprs>()) return false;
+    if (!out_shape.data().has_value()) return false;
+
+    // dyn_cast yields a null type for non-DenseTensorType results; check it
+    // before reading dtype().
+    const auto dense_type =
+        output.type().dyn_cast<paddle::dialect::DenseTensorType>();
+    if (!dense_type) return false;
+    const auto dtype = dense_type.dtype();
+    if (!dtype.isa<pir::Int32Type>() && !dtype.isa<pir::Int64Type>()) {
+      return false;
+    }
+
+    // Only process ops whose output is a single element.
+    return out_shape.data()->size() == 1;
+  }
+};
+
 class FuseShapeOpsIntoGenerateShapeOpPass : public pir::PatternRewritePass {
  public:
   FuseShapeOpsIntoGenerateShapeOpPass()
@@ -393,6 +477,7 @@ class FuseShapeOpsIntoGenerateShapeOpPass : public pir::PatternRewritePass {
         context);
     ps.Add<FuseShapeOpsIntoGenerateShapeOpPattern<paddle::dialect::SliceOp>>(
         context);
+    ps.Add<FuseSingleElementShapeOpsIntoGenerateShapeOpPattern>(context);
     return ps;
   }
 
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc
index 30b470d42ca2a..f2afbae3d515d 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc
@@ -83,8 +83,10 @@ std::optional<pir::Value> InsertGenerateShapeOpToRunFirst(
                                    &symbol_bindings);
   if (success) {
     return builder
-        ->Build<cinn::dialect::GenerateShapeOp>(
-            minimal_inputs, output_dim_expr_attrs, symbol_bindings)
+        ->Build<cinn::dialect::GenerateShapeOp>(minimal_inputs,
+                                                output_dim_expr_attrs,
+                                                symbol_bindings,
+                                                value.type())
         .out();
   }
   return std::nullopt;
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc
index 69723f8be0b86..86ae8d77d5296 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc
@@ -101,6 +101,9 @@ void SimplifyDimExpr(pir::Operation* module_op) {
 
   VisitEachOp(module_op, [&](pir::Operation& op) {
     VisitEachValue(op, [&](pir::Value value) {
+      if (!value || !value.type()) {
+        return;
+      }
       const symbol::ShapeOrDataDimExprs& shape_or_data =
           shape_analysis->GetShapeOrDataForValue(value);
       VLOG(8) << op.name() << "     origin_shape_or_data: " << shape_or_data;
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc
index 8f0bab178d75c..7beec47823a4d 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc
@@ -233,17 +233,24 @@ std::tuple<pir::Value, pir::Value, pir::Value> BroadcastableToCondValue(
                                                   &rhs_symbol_bindings);
   CHECK(success);
 
+  auto out_type = paddle::dialect::DenseTensorType::get(
+      builder.ir_context(),
+      pir::Int64Type::get(builder.ir_context()),
+      ::common::make_ddim({1}));
+
   auto lhs_value =
       builder
           .Build<cinn::dialect::GenerateShapeOp>(lhs_minimal_inputs,
                                                  lhs_output_dim_expr_attrs,
-                                                 lhs_symbol_bindings)
+                                                 lhs_symbol_bindings,
+                                                 out_type)
           .out();
   auto rhs_value =
       builder
           .Build<cinn::dialect::GenerateShapeOp>(rhs_minimal_inputs,
                                                  rhs_output_dim_expr_attrs,
-                                                 rhs_symbol_bindings)
+                                                 rhs_symbol_bindings,
+                                                 out_type)
           .out();
 
   auto const_one = builder
@@ -435,9 +442,11 @@ std::shared_ptr<BroadcastTree> ConstructBroadcastTree(
     const cinn::common::BroadcastLeaf& leaves) {
   VLOG(6) << "before constructed. broadcast-leaf: \n"
           << ToTxtString(cinn::common::BroadcastTree(leaves));
+  int num_of_leaves = 0;
   auto broadcast_tree = std::make_shared<cinn::common::BroadcastTree>(
-      cinn::common::ConstructBroadcastTree(
-          cinn::common::BroadcastLeaf(leaves)));
+      cinn::common::ConstructBroadcastTree(cinn::common::BroadcastLeaf(leaves),
+                                           &num_of_leaves));
+  VLOG(4) << "num of broadcast tree leaves:" << num_of_leaves;
   VLOG(4) << "broadcast-tree: \n" << ToTxtString(*broadcast_tree);
   return broadcast_tree;
 }
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc
index a36c208f0c96c..c2604697d68af 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc
@@ -110,23 +110,26 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) {
                           : group_op_kind;
     }
   }
-
-  auto group = std::make_shared<OpLoweringGroup>(ops);
-
-  if (fusion_op.attributes().count("group_info")) {
-    auto attr = fusion_op.attribute("group_info")
-                    .dyn_cast<cinn::dialect::GroupInfoAttribute>()
-                    .data();
-
-    group_op_kind =
-        static_cast<int>(attr.op_pattern_kind) > static_cast<int>(group_op_kind)
-            ? attr.op_pattern_kind
-            : group_op_kind;
-    group->set_loop_ranges(attr.loop_ranges);
-    group->set_loop_ranges_expr(attr.loop_ranges_expr);
-    group->set_reduce_axis(attr.reduce_axis);
-    group->set_alignment_schedule_info(attr.alignment_schedule_info);
-  }
+  PADDLE_ENFORCE_GT(fusion_op.attributes().count("group_info"),
+                    0UL,
+                    phi::errors::InvalidArgument(
+                        "fusion_op should have group_info attribute."));
+
+  const auto attr = fusion_op.attribute("group_info")
+                        .dyn_cast<cinn::dialect::GroupInfoAttribute>()
+                        .data();
+
+  const auto& fn_name = attr.fn_name;
+  auto group = std::make_shared<OpLoweringGroup>(ops, fn_name);
+
+  group_op_kind =
+      static_cast<int>(attr.op_pattern_kind) > static_cast<int>(group_op_kind)
+          ? attr.op_pattern_kind
+          : group_op_kind;
+  group->set_loop_ranges(attr.loop_ranges);
+  group->set_loop_ranges_expr(attr.loop_ranges_expr);
+  group->set_reduce_axis(attr.reduce_axis);
+  group->set_alignment_schedule_info(attr.alignment_schedule_info);
   group->set_op_pattern_kind(group_op_kind);
 
   // Rebuild output_ops and input_ops of the group
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc
index 648b3af363241..d4229ea9093bc 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc
@@ -33,116 +33,128 @@ namespace dialect {
 namespace ir {
 using CompatibleInfo = cinn::hlir::framework::pir::CompatibleInfo;
 
-class SumOpPattern : public paddle::drr::DrrPatternBase {
- public:
-  std::string name() const override { return "SumOpPattern"; }
-
-  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
-    // Source Pattern
-    paddle::drr::SourcePattern pattern = ctx->SourcePattern();
-    const auto &full_int_array =
-        pattern.Op(paddle::dialect::FullIntArrayOp::name(),
-                   {{"value", pattern.Attr("axis_info")},
-                    {"dtype", pattern.Attr("dtype_2")},
-                    {"place", pattern.Attr("place_2")}});
-
-    const auto &sum = pattern.Op(paddle::dialect::SumOp::name(),
-                                 {{"dtype", pattern.Attr("dtype")},
-                                  {"keepdim", pattern.Attr("keep_dim")}});
-    pattern.Tensor("ret") = sum(pattern.Tensor("arg0"), full_int_array());
-
-    // Result patterns
-    paddle::drr::ResultPattern res = pattern.ResultPattern();
-    const auto &cinn_reduce_sum =
-        res.Op(cinn::dialect::ReduceSumOp::name(),
-               {{"dim", pattern.Attr("axis_info")},
-                {"dtype", pattern.Attr("dtype")},
-                {"keep_dim", pattern.Attr("keep_dim")}});
-    res.Tensor("ret") = cinn_reduce_sum(res.Tensor("arg0"));
+namespace {
+
+// Returns the elements of `array_attr`, each read as an Int64Attribute.
+template <typename T = int>
+std::vector<T> GetVectorFromIntArrayAttribute(
+    const pir::ArrayAttribute &array_attr) {
+  const auto &vector_attr = array_attr.AsVector();
+  std::vector<T> result;
+  result.reserve(vector_attr.size());
+  for (const auto &attr : vector_attr) {
+    // Validate every element; dyn_cast of a mismatched attribute is null.
+    PADDLE_ENFORCE_EQ(attr.isa<::pir::Int64Attribute>(),
+                      true,
+                      phi::errors::Unimplemented(
+                          "each element MUST be ir::Int64Attribute"));
+    result.push_back(attr.dyn_cast<::pir::Int64Attribute>().data());
   }
-};
+  return result;
+}
 
-class MaxOpPattern : public paddle::drr::DrrPatternBase {
- public:
-  std::string name() const override { return "MaxOpPattern"; }
+}  // namespace
 
-  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
-    // Source Pattern
-    paddle::drr::SourcePattern pattern = ctx->SourcePattern();
-    const auto &full_int_array =
-        pattern.Op(paddle::dialect::FullIntArrayOp::name(),
-                   {{"value", pattern.Attr("axis_info")},
-                    {"dtype", pattern.Attr("dtype_2")},
-                    {"place", pattern.Attr("place_2")}});
+class SumOpPattern : public pir::OpRewritePattern<paddle::dialect::SumOp> {
+ public:
+  using pir::OpRewritePattern<paddle::dialect::SumOp>::OpRewritePattern;
 
-    const auto &pd_max = pattern.Op(paddle::dialect::MaxOp::name(),
-                                    {{"keepdim", pattern.Attr("keep_dim")}});
-    pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array());
+  bool Match(paddle::dialect::SumOp op) const override {
+    if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false;
+    auto *axes_op = op->operand_source(1).defining_op();
+    return axes_op && axes_op->isa<paddle::dialect::FullIntArrayOp>();
+  }
 
-    // Result patterns
-    paddle::drr::ResultPattern res = pattern.ResultPattern();
-    const auto &cinn_reduce_max =
-        res.Op(cinn::dialect::ReduceMaxOp::name(),
-               {{"dim", pattern.Attr("axis_info")},
-                {"keep_dim", pattern.Attr("keep_dim")}});
-    res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0"));
+  void Rewrite(paddle::dialect::SumOp op,
+               pir::PatternRewriter &rewriter) const override {
+    auto *axes_op = op->operand_source(1).defining_op();
+    auto full_int_array_op =
+        axes_op->dyn_cast<paddle::dialect::FullIntArrayOp>();
+
+    // get attribute value from full_int_array op
+    const std::vector<int64_t> axis = GetVectorFromIntArrayAttribute<int64_t>(
+        full_int_array_op.attribute("value").dyn_cast<pir::ArrayAttribute>());
+    const bool keep_dim =
+        op.attribute("keepdim").dyn_cast<::pir::BoolAttribute>().data();
+    const auto &dtype = op.attribute("dtype")
+                            .dyn_cast<paddle::dialect::DataTypeAttribute>()
+                            .data();
+
+    auto cinn_reduce = rewriter.Build<cinn::dialect::ReduceSumOp>(
+        op->operand_source(0), axis, keep_dim, dtype);
+    rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0));
+    rewriter.EraseOp(op);
+    if (full_int_array_op->use_empty()) {
+      rewriter.EraseOp(full_int_array_op);
+    }
   }
 };
 
-class MinOpPattern : public paddle::drr::DrrPatternBase {
+template <typename SOURCE_OP, typename TARGET_OP>
+class ReduceMinMaxOpPattern : public pir::OpRewritePattern<SOURCE_OP> {
  public:
-  std::string name() const override { return "MinOpPattern"; }
+  using pir::OpRewritePattern<SOURCE_OP>::OpRewritePattern;
 
-  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
-    // Source Pattern
-    paddle::drr::SourcePattern pattern = ctx->SourcePattern();
-    const auto &full_int_array =
-        pattern.Op(paddle::dialect::FullIntArrayOp::name(),
-                   {{"value", pattern.Attr("axis_info")},
-                    {"dtype", pattern.Attr("dtype_2")},
-                    {"place", pattern.Attr("place_2")}});
-
-    const auto &pd_max = pattern.Op(paddle::dialect::MinOp::name(),
-                                    {{"keepdim", pattern.Attr("keep_dim")}});
-    pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array());
+  bool Match(SOURCE_OP op) const override {
+    if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false;
+    auto *axes_op = op->operand_source(1).defining_op();
+    return axes_op && axes_op->template isa<paddle::dialect::FullIntArrayOp>();
+  }
 
-    // Result patterns
-    paddle::drr::ResultPattern res = pattern.ResultPattern();
-    const auto &cinn_reduce_max =
-        res.Op(cinn::dialect::ReduceMinOp::name(),
-               {{"dim", pattern.Attr("axis_info")},
-                {"keep_dim", pattern.Attr("keep_dim")}});
-    res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0"));
+  void Rewrite(SOURCE_OP op, pir::PatternRewriter &rewriter) const override {
+    auto *axes_op = op->operand_source(1).defining_op();
+    auto full_int_array_op =
+        axes_op->template dyn_cast<paddle::dialect::FullIntArrayOp>();
+
+    // get attribute value from full_int_array op
+    const std::vector<int64_t> axis = GetVectorFromIntArrayAttribute<int64_t>(
+        full_int_array_op.attribute("value")
+            .template dyn_cast<pir::ArrayAttribute>());
+    const bool keep_dim = op.attribute("keepdim")
+                              .template dyn_cast<::pir::BoolAttribute>()
+                              .data();
+
+    auto cinn_reduce =
+        rewriter.Build<TARGET_OP>(op->operand_source(0), axis, keep_dim);
+    rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0));
+    rewriter.EraseOp(op);
+    if (full_int_array_op->use_empty()) {
+      rewriter.EraseOp(full_int_array_op);
+    }
   }
 };
 
-class ProdOpPattern : public paddle::drr::DrrPatternBase {
+class ProdOpPattern : public pir::OpRewritePattern<paddle::dialect::ProdOp> {
  public:
-  std::string name() const override { return "ProdOpPattern"; }
+  using pir::OpRewritePattern<paddle::dialect::ProdOp>::OpRewritePattern;
 
-  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
-    // Source Pattern
-    paddle::drr::SourcePattern pattern = ctx->SourcePattern();
-    const auto &full_int_array =
-        pattern.Op(paddle::dialect::FullIntArrayOp::name(),
-                   {{"value", pattern.Attr("axis_info")},
-                    {"dtype", pattern.Attr("dtype_2")},
-                    {"place", pattern.Attr("place_2")}});
-
-    const auto &pd_max =
-        pattern.Op(paddle::dialect::ProdOp::name(),
-                   {{"keep_dim", pattern.Attr("keep_dim")},
-                    {"reduce_all", pattern.Attr("reduce_all")}});
-    pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array());
+  bool Match(paddle::dialect::ProdOp op) const override {
+    if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false;
+    auto *axes_op = op->operand_source(1).defining_op();
+    return axes_op && axes_op->isa<paddle::dialect::FullIntArrayOp>();
+  }
 
-    // Result patterns
-    paddle::drr::ResultPattern res = pattern.ResultPattern();
-    const auto &cinn_reduce_max =
-        res.Op(cinn::dialect::ReduceProdOp::name(),
-               {{"dim", pattern.Attr("axis_info")},
-                {"keep_dim", pattern.Attr("keep_dim")},
-                {"reduce_all", pattern.Attr("reduce_all")}});
-    res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0"));
+  void Rewrite(paddle::dialect::ProdOp op,
+               pir::PatternRewriter &rewriter) const override {
+    auto *axes_op = op->operand_source(1).defining_op();
+    auto full_int_array_op =
+        axes_op->dyn_cast<paddle::dialect::FullIntArrayOp>();
+
+    // get attribute value from full_int_array op
+    const std::vector<int64_t> axis = GetVectorFromIntArrayAttribute<int64_t>(
+        full_int_array_op.attribute("value").dyn_cast<pir::ArrayAttribute>());
+    const bool keep_dim =
+        op.attribute("keep_dim").dyn_cast<::pir::BoolAttribute>().data();
+    const bool reduce_all =
+        op.attribute("reduce_all").dyn_cast<::pir::BoolAttribute>().data();
+
+    auto cinn_reduce = rewriter.Build<cinn::dialect::ReduceProdOp>(
+        op->operand_source(0), axis, keep_dim, reduce_all);
+    rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0));
+    rewriter.EraseOp(op);
+    if (full_int_array_op->use_empty()) {
+      rewriter.EraseOp(full_int_array_op);
+    }
   }
 };
 
@@ -238,6 +250,7 @@ class ReshapeOpPattern
     auto cinn_reshape = rewriter.Build<cinn::dialect::ReshapeOp>(
         op->operand_source(0), vec_out_shape);
     rewriter.ReplaceAllUsesWith(op.result(0), cinn_reshape.result(0));
+    rewriter.ReplaceAllUsesWith(op.result(1), cinn_reshape.result(1));
     rewriter.EraseOp(op);
   }
 };
@@ -882,6 +895,7 @@ class SqueezeOpPattern
           op->operand_source(0), output_shape);
 
       rewriter.ReplaceAllUsesWith(op.result(0), cinn_reshape.result(0));
+      rewriter.ReplaceAllUsesWith(op.result(1), cinn_reshape.result(1));
 
       rewriter.EraseOp(op);
 
@@ -929,6 +943,7 @@ class UnsqueezeOpPattern
           op->operand_source(0), output_shape);
 
       rewriter.ReplaceAllUsesWith(op.result(0), cinn_reshape.result(0));
+      rewriter.ReplaceAllUsesWith(op.result(1), cinn_reshape.result(1));
 
       rewriter.EraseOp(op);
 
@@ -1023,6 +1038,7 @@ class FlattenOpPattern
     reshape_op.result(0).set_type(op.result(0).type());
 
     rewriter.ReplaceAllUsesWith(op.result(0), reshape_op.result(0));
+    rewriter.ReplaceAllUsesWith(op.result(1), reshape_op.result(1));
 
     rewriter.EraseOp(op);
   }
@@ -1117,10 +1133,12 @@ pir::RewritePatternSet PdOpToCinnOpPass::InitializePatterns(
   pir::RewritePatternSet ps(context);
   ps.Add<ScaleOpPattern>(
       context);  // NOTE, scale op pattern should before AddBroadcastTo
-  ps.Add(paddle::drr::Create<SumOpPattern>(context));
-  ps.Add(paddle::drr::Create<MaxOpPattern>(context));
-  ps.Add(paddle::drr::Create<MinOpPattern>(context));
-  ps.Add(paddle::drr::Create<ProdOpPattern>(context));
+  ps.Add<SumOpPattern>(context);
+  ps.Add<ReduceMinMaxOpPattern<paddle::dialect::MinOp,
+                               cinn::dialect::ReduceMinOp>>(context);
+  ps.Add<ReduceMinMaxOpPattern<paddle::dialect::MaxOp,
+                               cinn::dialect::ReduceMaxOp>>(context);
+  ps.Add<ProdOpPattern>(context);
   ps.Add<ReshapeOpPattern>(context);
   ps.Add<PowOpPattern>(context);
   ps.Add<ConcatOpPattern>(context);
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc
index 74f3e4b4f200d..162d33a20ee54 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc
@@ -225,10 +225,23 @@ struct PirToPyCodeConverterHelper {
 
   std::vector<pir::Value> GetFreeVars(const pir::Block& block) {
     std::vector<pir::Value> inputs;
+    const auto IsBlockPositionalArg = [&](pir::Value value) {
+      const auto& args = block.args();
+      return std::find(args.begin(), args.end(), value) != args.end();
+    };
+    const auto IsBlockKeywardArg = [&](pir::Value value) {
+      const auto& kwargs = block.kwargs();
+      for (const auto& [_, kwarg] : kwargs) {
+        if (kwarg == value) return true;
+      }
+      return false;
+    };
     for (const auto& value : GetUsedExternalValue(block)) {
       if (!value) continue;
       if (std::find(inputs.begin(), inputs.end(), value) != inputs.end())
         continue;
+      if (IsBlockPositionalArg(value)) continue;
+      if (IsBlockKeywardArg(value)) continue;
       inputs.push_back(value);
     }
     return inputs;
@@ -330,6 +343,9 @@ struct PirToPyCodeConverterHelper {
           "):");
       IStrings return_lambda{ret_lambda_declare};
       PushBackIndented(&return_lambda, block_body);
+      if (block_body.empty()) {
+        return_lambda.push_back(Indent("pass"));
+      }
       return return_lambda;
     };
     std::string free_vars_as_args = ConvertFreeVarsAsArgs(block);
@@ -866,27 +882,52 @@ struct PirToPyCodeConverterHelper {
   }
 
   std::string ConvertInputTypes(const pir::Operation* op) {
-    std::stringstream ss;
-    ss << "[";
-    for (int i = 0; i < op->num_operands(); ++i) {
-      if (i > 0) {
-        ss << ", ";
+    const auto& VisitValue = [&](const auto& DoEachValue) {
+      for (int i = 0; i < op->num_operands(); ++i) {
+        DoEachValue(op->operand_source(i));
       }
-      ss << ConvertType(op->operand_source(i).type());
-    }
-    ss << "]";
-    return ss.str();
+    };
+    return ConvertValueTypes(VisitValue);
+  }
+
+  std::string ConvertBlockArgTypes(const pir::Block& block) {
+    const auto& VisitValue = [&](const auto& DoEachValue) {
+      for (const auto& arg : block.args()) {
+        DoEachValue(arg);
+      }
+    };
+    return ConvertValueTypes(VisitValue);
+  }
+
+  std::string ConvertBlockKwArgTypes(const pir::Block& block) {
+    const auto& VisitValue = [&](const auto& DoEachValue) {
+      for (const auto& [_, arg] : block.kwargs()) {
+        DoEachValue(arg);
+      }
+    };
+    return ConvertValueTypes(VisitValue);
   }
 
   std::string ConvertOutputTypes(const pir::Operation* op) {
+    const auto& VisitValue = [&](const auto& DoEachValue) {
+      for (int i = 0; i < op->num_results(); ++i) {
+        DoEachValue(op->result(i));
+      }
+    };
+    return ConvertValueTypes(VisitValue);
+  }
+
+  template <typename VisitValueT>
+  std::string ConvertValueTypes(const VisitValueT& VisitValue) {
     std::stringstream ss;
     ss << "[";
-    for (int i = 0; i < op->num_results(); ++i) {
-      if (i > 0) {
+    int i = 0;
+    VisitValue([&](pir::Value value) {
+      if (i++ > 0) {
         ss << ", ";
       }
-      ss << ConvertType(op->result(i).type());
-    }
+      ss << ConvertType(value.type());
+    });
     ss << "]";
     return ss.str();
   }
@@ -1098,7 +1139,45 @@ struct PirToPyCodeConverterHelper {
         }
         ss << "]";
       }
-      ss << "]";
+      ss << "], ";
+    }
+    {
+      int i = 0;
+      ss << "block_positional_arg_types=[";
+      for (const auto& region : *op) {
+        if (i++ > 0) {
+          ss << ",";
+        }
+        int j = 0;
+        ss << "[";
+        for (const auto& block : region) {
+          if (j++ > 0) {
+            ss << ",";
+          }
+          ss << ConvertBlockArgTypes(block);
+        }
+        ss << "]";
+      }
+      ss << "], ";
+    }
+    {
+      int i = 0;
+      ss << "block_keyword_arg_types=[";
+      for (const auto& region : *op) {
+        if (i++ > 0) {
+          ss << ",";
+        }
+        int j = 0;
+        ss << "[";
+        for (const auto& block : region) {
+          if (j++ > 0) {
+            ss << ",";
+          }
+          ss << ConvertBlockKwArgTypes(block);
+        }
+        ss << "]";
+      }
+      ss << "], ";
     }
     return ss.str();
   }
@@ -1138,18 +1217,10 @@ struct PirToPyCodeConverterHelper {
 
   std::string GetPyClassName() {
     std::ostringstream ss;
-    ss << "PirProgram_" << RandomInt();
+    ss << "PirProgram_" << program_->id();
     return ss.str();
   }
 
-  int64_t RandomInt() {
-    std::random_device rd{};
-    std::mt19937_64 gen(rd());
-    std::uniform_int_distribution<int64_t> dis(
-        0, std::numeric_limits<int64_t>::max());
-    return dis(gen);
-  }
-
   std::string ConvertIStringsToString(const IStrings& istrings) {
     std::stringstream ss;
     for (const auto& istring : istrings) {
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc
index 4dd7e3ecf3e7d..98a8ff2e7ec3e 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc
@@ -136,10 +136,10 @@ struct CachedDimExprToValueConverter {
           ->Build<paddle::dialect::FlattenOp>(value, 0, dims.size() - 1)
           .out();
     };
-    if (tensor_dim.value.type()
-            .dyn_cast<paddle::dialect::DenseTensorType>()
-            .dims()
-            .size() == 0) {
+    const auto& ddim = tensor_dim.value.type()
+                           .dyn_cast<paddle::dialect::DenseTensorType>()
+                           .dims();
+    if (ddim.size() == 0 || (ddim.size() == 1 && ddim[0] == 1)) {
       return CastToInt64IfNeed(tensor_dim.value);
     }
     return CastToInt64IfNeed(rewriter
diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h
index 4d5284f22f6ed..3711f102dc2e8 100644
--- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h
+++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h
@@ -31,6 +31,8 @@ struct BucketLoweredFuncsWrapper {
   std::vector<std::pair<ir::SymbolicPredicate, ir::LoweredFunc>>
       predicate2funcs;
   ir::LoweredFunc infer_shape_func;
+  std::vector<std::pair<ir::SymbolicPredicate, ir::LoweredFunc>>
+      predicate2funcsCX86;
 };
 
 template <typename T>
diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.cc b/paddle/cinn/hlir/framework/pir/compilation_cache.cc
index 1c5322c38866e..86f65bfb5c8db 100644
--- a/paddle/cinn/hlir/framework/pir/compilation_cache.cc
+++ b/paddle/cinn/hlir/framework/pir/compilation_cache.cc
@@ -37,11 +37,25 @@ void* BackendResource::GetInferFuncPtr() const {
   return ptr;
 }
 
+// Looks up the host function named `host_fn_name_ + "_CX86"` in the backend
+// compiler and returns its address; enforces that the lookup succeeded.
+void* BackendResource::GetCX86HostFuncPtr() const {
+  const auto cx86_fn_name = host_fn_name_ + "_CX86";
+  VLOG(4) << "Lookup kernel name: " << cx86_fn_name;
+  void* fn_ptr = backend_compiler_->Lookup(cx86_fn_name);
+  PADDLE_ENFORCE_NOT_NULL(
+      fn_ptr,
+      ::common::errors::InvalidArgument("Can't find kernel function %s",
+                                        cx86_fn_name));
+  return fn_ptr;
+}
+
 pir::CINNKernelInfo BackendResource::GenerateKernelInfo() const {
   pir::CINNKernelInfo kernel_info;
   kernel_info.fn_name = host_fn_name_;
   kernel_info.fn_ptr = GetHostFuncPtr();
   kernel_info.infer_shape_fn_ptr = GetInferFuncPtr();
+  kernel_info.CX86_fn_ptr = GetCX86HostFuncPtr();
   kernel_info.int_args_map = GetIntArgsMap();
   return kernel_info;
 }
diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.h b/paddle/cinn/hlir/framework/pir/compilation_cache.h
index 0294755d399ef..f0f6c53380395 100644
--- a/paddle/cinn/hlir/framework/pir/compilation_cache.h
+++ b/paddle/cinn/hlir/framework/pir/compilation_cache.h
@@ -41,6 +41,7 @@ class BackendResource final {
 
   void* GetHostFuncPtr() const;
   void* GetInferFuncPtr() const;
+  void* GetCX86HostFuncPtr() const;
   const std::map<int, CINNKernelInfo::ArgDimIdx>& GetIntArgsMap() const {
     return int_args_map_;
   }
diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc
index 1304979d14a61..39ddcf8291306 100644
--- a/paddle/cinn/hlir/framework/pir/compilation_task.cc
+++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc
@@ -29,6 +29,11 @@ void GroupCompilationContext::SetLoweredFuncs(
     predicates_.push_back(std::move(predicate2func.first));
     lowered_funcs_.push_back(std::move(predicate2func.second));
   }
+  for (std::pair<ir::SymbolicPredicate, ir::LoweredFunc>& predicate2func :
+       funcs.predicate2funcsCX86) {
+    CX86_predicates_.push_back(std::move(predicate2func.first));
+    CX86_lowered_funcs_.push_back(std::move(predicate2func.second));
+  }
   infer_shape_lowered_func_ = std::move(funcs.infer_shape_func);
 }
 
@@ -73,11 +78,24 @@ std::shared_ptr<pir::CompilationResult> CompilationTask::CodegenAndJit() {
   }
   builder.SetInferShapeFunc(context_->infer_shape_lowered_func_);
   ir::Module ir_module = builder.Build();
-  return BuildPirCINNKernelInfo(ir_module);
+
+  ir::Module::Builder builder_CX86(cinn::common::UniqName("module"),
+                                   common::DefaultHostTarget());
+  CHECK_EQ(context_->CX86_predicates_.size(),
+           context_->CX86_lowered_funcs_.size());
+  for (const ir::Expr& predicate : context_->CX86_predicates_) {
+    builder_CX86.AddPredicate(predicate);
+  }
+  for (const ir::LoweredFunc& func : context_->CX86_lowered_funcs_) {
+    builder_CX86.AddFunction(func);
+  }
+  ir::Module ir_moduleCX86 = builder_CX86.Build();
+
+  return BuildPirCINNKernelInfo(ir_module, ir_moduleCX86);
 }
 
 std::shared_ptr<pir::CompilationResult> CompilationTask::BuildPirCINNKernelInfo(
-    const ir::Module& module) {
+    const ir::Module& module, const ir::Module& CX86module) {
   auto compilation_result =
       std::make_shared<pir::CompilationResult>(context_->target_);
   auto backend_resource = std::make_shared<pir::BackendResource>(
@@ -86,7 +104,8 @@ std::shared_ptr<pir::CompilationResult> CompilationTask::BuildPirCINNKernelInfo(
       context_->group_->FuncName() + "_infer_shape",
       context_->group_->int_args_map());
   VLOG(5) << "Start to compile module into cuda kernel...";
-  backend_resource->GetBackendCompiler()->Build(module, "");
+  backend_resource->GetBackendCompiler()->Build(module, "", false);
+  backend_resource->GetBackendCompiler()->AppendCX86(CX86module);
   compilation_result->SetBackendResource(backend_resource);
   VLOG(5) << "End to compile module into cuda kernel.";
   return compilation_result;
diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.h b/paddle/cinn/hlir/framework/pir/compilation_task.h
index d104d264b6852..1ed3e2d5e6217 100644
--- a/paddle/cinn/hlir/framework/pir/compilation_task.h
+++ b/paddle/cinn/hlir/framework/pir/compilation_task.h
@@ -42,6 +42,8 @@ class GroupCompilationContext {
   const pir::OpLoweringGroupPtr& group_;
   std::vector<ir::SymbolicPredicate> predicates_;
   std::vector<ir::LoweredFunc> lowered_funcs_;
+  std::vector<ir::SymbolicPredicate> CX86_predicates_;
+  std::vector<ir::LoweredFunc> CX86_lowered_funcs_;
   ir::LoweredFunc infer_shape_lowered_func_;
 };
 
@@ -56,7 +58,7 @@ class CompilationTask {
   void Lowering();
   std::shared_ptr<pir::CompilationResult> CodegenAndJit();
   std::shared_ptr<pir::CompilationResult> BuildPirCINNKernelInfo(
-      const ir::Module& module);
+      const ir::Module& module, const ir::Module& CX86module);
 
   GroupCompilationContext* context_;
 };
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc
index e5187f47ab471..e23ec953431c0 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc
@@ -145,8 +145,9 @@ std::shared_ptr<OpLoweringGroup> OpLoweringGroup::Clone(
     ops_mapper[op] = new_op;
   }
 
+  const auto new_fn_name = this->fn_name_ + "_cloned";
   // Construct Base information for new Group
-  auto new_group = std::make_shared<OpLoweringGroup>(new_ops);
+  auto new_group = std::make_shared<OpLoweringGroup>(new_ops, new_fn_name);
   for (auto* op : this->output_ops_) {
     new_group->output_ops_.insert(ops_mapper.at(op));
   }
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.h b/paddle/cinn/hlir/framework/pir/op_lowering_group.h
index 935e759ed2331..7595985d4d5b9 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_group.h
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.h
@@ -42,15 +42,13 @@ class OpLoweringGroup {
   OpLoweringGroup(const OpLoweringGroup&) = delete;
   OpLoweringGroup(OpLoweringGroup&&) = delete;
 
-  explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops)
-      : ops_(group_ops) {
-    fn_name_ = CompatibleInfo::GroupOpsName(ops_);
-  }
+  explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops,
+                           const std::string& fn_name)
+      : ops_(group_ops), fn_name_(fn_name) {}
 
-  explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops)
-      : ops_(group_ops) {
-    fn_name_ = CompatibleInfo::GroupOpsName(ops_);
-  }
+  explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops,
+                           const std::string& fn_name)
+      : ops_(group_ops), fn_name_(fn_name) {}
 
   const std::string& FuncName() const { return this->fn_name_; }
   ::pir::Block* GetParentBlock() const;
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
index 8ba8753a84eaf..4c4362aec935d 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
@@ -20,6 +20,7 @@
 #include "paddle/cinn/ast_gen_ius/tensor_group.h"
 #include "paddle/cinn/backends/codegen_device_util.h"
 #include "paddle/cinn/common/dim_expr_converter.h"
+#include "paddle/cinn/common/target.h"
 #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h"
 #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h"
 #include "paddle/cinn/hlir/framework/compile_error.h"
@@ -124,19 +125,9 @@ std::shared_ptr<GroupInfo> OpLowererImpl::GetGroupInfo(
     }
   }
 
-  BuildBroadcastInfo(group, group_info);
-
   for (auto& op : group->output_ops()) {
     group_info->direct_output_var_names.insert(ValueName(op->result(0)));
     // collect all output tensor.
-    if (op->name() == "cinn_op.yield_store") {
-      auto input_var_name = ValueName(op->operand_source(0));
-      if (group_info->broadcast_info.count(input_var_name)) {
-        auto base_info = group_info->broadcast_info[input_var_name];
-        base_info.with_constrain = true;
-        group_info->broadcast_info[ValueName(op->result(0))] = base_info;
-      }
-    }
     for (auto opresult : op->results()) {
       if (tensor_map.count(opresult) == 0) {
         continue;
@@ -146,13 +137,7 @@ std::shared_ptr<GroupInfo> OpLowererImpl::GetGroupInfo(
   }
 
   for (const auto& val : group->output_values()) {
-    if (val.defining_op()->name() == "cinn_op.reshape" &&
-        erase_reshape.count(val.defining_op())) {
-      group_info->direct_output_var_names.insert(
-          ValueName(val.defining_op()->operand_source(0)));
-    } else {
-      group_info->direct_output_var_names.insert(ValueName(val));
-    }
+    group_info->direct_output_var_names.insert(ValueName(val));
   }
   return group_info;
 }
@@ -207,6 +192,8 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower(
   if (ops.size() == 1 && ops[0]->name() == "custom_call") {
     return {{{ir::Expr(1), LowerCustomCall(group)[0]}}, ir::LoweredFunc()};
   }
+  auto X86Expr = LowerX86(group, ops, apply_op_schedule);
+  VLOG(3) << "After x86 lower, ir is: \n" << X86Expr;
 
   std::vector<ir::Tensor> group_func_arg_tensors;
   std::unordered_map<::pir::Value, ir::Tensor> tensor_map;
@@ -272,6 +259,9 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower(
                                   ir_sch.GetModule().GetExprs()[0]);
   }
 
+  // The last func is stored as a kernel on x86
+  cond2func_bodies.emplace_back(ir::Expr(true), X86Expr);
+
   // 3.Do post-processing,
   // including preparing function args and temporary variables,
   // applying low-level optimization passes, etc.
@@ -296,10 +286,16 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower(
                         "The size of funcs and cond2func_bodies should be "
                         "the same."));
   BucketLoweredFuncsWrapper funcs_wrapper;
-  for (int i = 0; i < funcs.size(); ++i) {
+  for (int i = 0; i < funcs.size() - 1; ++i) {
     funcs_wrapper.predicate2funcs.emplace_back(cond2func_bodies[i].first,
                                                funcs[i]);
   }
+  // The last func is x86 kernel.
+  for (size_t i = funcs.size() - 1; i < funcs.size(); ++i) {
+    funcs[i]->name = funcs[i]->name + "_CX86";
+    funcs_wrapper.predicate2funcsCX86.emplace_back(cond2func_bodies[i].first,
+                                                   funcs[i]);
+  }
   funcs_wrapper.infer_shape_func =
       GenerateInferShapeFunc(group, infer_shape_tensor_args, group_func_args);
 
@@ -514,159 +510,6 @@ std::vector<ir::LoweredFunc> OpLowererImpl::LowerGroup(
                      &infer_shape_args);
 }
 
-void OpLowererImpl::BuildBroadcastInfo(const OpLoweringGroupPtr& group,
-                                       std::shared_ptr<GroupInfo> group_info) {
-  // TODO(phlrain): this is primary verion for loop aligment
-  // will be update by a new method
-  auto& align_info = group->mut_alignment_schedule_info();
-
-  auto& ops = group->ops();
-  for (auto op1 : ops) {
-    auto it = align_info.find(op1);
-    if (it == align_info.end()) {
-      continue;
-    }
-    if (op1->name() == "cinn_op.generate_shape") {
-      continue;
-    }
-
-    if (it->second.size() > 1) {
-      for (size_t i = 0; i < it->second.size(); ++i) {
-      }
-      // TODO(phlran): merge to factor info here
-      it->second.front().factor_info = it->second.back().factor_info;
-      it->second.resize(1);
-    }
-
-    PADDLE_ENFORCE_EQ(
-        it->second.size(),
-        1,
-        phi::errors::Unimplemented("%s, only suppopt one transform yet",
-                                   it->first->name()));
-
-    if (it->second[0].type == ScheduleAlignType::kBroadcast) {
-      // get broadcast op
-      auto broadcast_axes = it->second[0].axis_info;
-      auto output_shape = it->second[0].factor_info;
-
-      phi::DDim in_dim;
-
-      if (it->first->name() == "cinn_op.reshape") {
-        // TODO(phlrain): deal with reshape in a better way
-        if (it->first->result(0).use_count() == 1 &&
-            it->first->result(0).first_use().owner()->isa<::pir::YieldOp>()) {
-          continue;
-        }
-      }
-
-      if ((it->first->name() != "cinn_op.reshape") &&
-          (it->first->name() != "cinn_op.broadcast") &&
-          (it->first->num_operands() == 1)) {
-        in_dim = it->first->operand_source(0)
-                     .type()
-                     .dyn_cast<paddle::dialect::DenseTensorType>()
-                     .dims();
-      } else {
-        in_dim = it->first->result(0)
-                     .type()
-                     .dyn_cast<paddle::dialect::DenseTensorType>()
-                     .dims();
-      }
-
-      cinn::ir::BroadcastInfo info;
-      if (in_dim.size() == 1u && in_dim[0] == 1u) {
-        info.full_broadcast = true;
-        for (size_t i = 0; i < output_shape.size(); ++i) {
-          info.broadcast_axes.push_back(i);
-          info.output_shape.push_back(-1);
-          info.output_dim_expr.push_back(group->loop_ranges_expr()[i]);
-        }
-      } else if (in_dim.size() == broadcast_axes.size()) {
-        if (in_dim.size() != output_shape.size()) {
-          info.split_first = true;
-
-          if (broadcast_axes.size() == 1) {
-            std::vector<int> temp_shape(output_shape.size(), 1);
-            temp_shape[broadcast_axes[0]] = output_shape[broadcast_axes[0]];
-            info.split_info.emplace_back(0, temp_shape);
-
-            for (size_t i = 0; i < output_shape.size(); ++i) {
-              if (i != broadcast_axes[0]) {
-                info.broadcast_axes.push_back(i);
-                info.output_shape.push_back(output_shape[i]);
-              }
-            }
-          } else {
-            throw std::runtime_error("not support multi dim broadcast yet");
-          }
-        } else {
-          for (size_t i = 0; i < broadcast_axes.size(); ++i) {
-            if (in_dim[i] < 0 || output_shape[broadcast_axes[i]] < 0) {
-              continue;
-            }
-            if (in_dim[i] != output_shape[broadcast_axes[i]]) {
-              if (in_dim[i] != 1) {
-                throw std::runtime_error("Only support 1 - D broadcast ");
-              }
-              info.broadcast_axes.push_back(i);
-              info.output_shape.push_back(output_shape[broadcast_axes[i]]);
-            }
-          }
-        }
-      } else {
-        // only deal with broadcast axes
-        std::set<int> axes_set;
-        for (size_t i = 0; i < broadcast_axes.size(); ++i) {
-          axes_set.insert(broadcast_axes[i]);
-          if (in_dim[broadcast_axes[i]] != 1) {
-            throw std::runtime_error("Only support 1 - D broadcast ");
-          }
-
-          info.broadcast_axes.push_back(broadcast_axes[i]);
-          info.output_shape.push_back(output_shape[broadcast_axes[i]]);
-        }
-      }
-
-      for (size_t i = 0; i < it->first->num_operands(); ++i) {
-        if (!align_info.count(it->first->operand_source(i).defining_op())) {
-          info.first_broadcast = true;
-          break;
-        }
-      }
-
-      auto op_out = it->first->result(0);
-      info.op_name = it->first->name();
-
-      if (op_out.use_count() == 1 &&
-          op_out.first_use().owner()->name() == "cf.yield") {
-        info.with_constrain = true;
-      }
-
-      if (erase_reshape.count(op_out.first_use().owner())) {
-        info.with_constrain = true;
-      }
-
-      group_info->broadcast_info[ValueName(op_out)] = info;
-
-      for (auto use_it = op_out.use_begin(); use_it != op_out.use_end();
-           ++use_it) {
-        if (use_it->owner()->name() == "cf.yield") {
-          continue;
-        }
-        if (CompatibleInfo::OpKind(*(use_it->owner())) ==
-            framework::kBroadcast) {
-          if (!info.full_broadcast) {
-            group_info->broadcast_to_elementwise[ValueName(
-                use_it->owner()->result(0))] = info;
-          }
-        }
-      }
-    } else {
-      throw std::runtime_error("only supportbroadcast type for now");
-    }
-  }
-}
-
 std::vector<ir::LoweredFunc> OpLowererImpl::LowerCustomCall(
     const OpLoweringGroupPtr& group) {
   const auto& ops = group->ops();
@@ -777,10 +620,6 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
       }
     }
     infer_shape_arg_tensor->push_back(tensor);
-    if ((op_result.defining_op()->name() == "cinn_op.reshape") &&
-        erase_reshape.count(op_result.defining_op())) {
-      tensor = tensor_map.at(op_result.defining_op()->operand_source(0));
-    }
 
     if (arg_name_set.count(tensor->buffer->name) != 0) {
       continue;
@@ -846,18 +685,21 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
     }
   }
   std::vector<ir::LoweredFunc> lowered_funcs;
-  for (ir::Expr func_body : func_bodies) {
+  for (int i = 0; i < func_bodies.size(); ++i) {
+    ir::Expr func_body = func_bodies[i];
     optim::EliminateDeadScheduleBlock(&(func_body), group->output_names());
-    cinn::common::DefaultDeviceTarget().arch.Match(
-        [&](std::variant<common::UnknownArch,
-                         common::X86Arch,
-                         common::ARMArch>) {},
-        [&](common::NVGPUArch) {
+    if (i != func_bodies.size() - 1) {
+      cinn::common::DefaultDeviceTarget().arch.Match(
+          [&](std::variant<common::UnknownArch,
+                           common::X86Arch,
+                           common::ARMArch>) {},
+          [&](common::NVGPUArch) {
 #ifdef CINN_WITH_CUDA
-          optim::EliminateCommonGlobalMemoryRead(&(func_body));
-          optim::OptimizeExprGPU(&(func_body));
+            optim::EliminateCommonGlobalMemoryRead(&(func_body));
+            optim::OptimizeExprGPU(&(func_body));
 #endif
-        });
+          });
+    }
 
     // 2.Prepare temp buffers
     auto temp_buffers =
@@ -869,8 +711,13 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
       func->PrepareBufferCastExprs();
     }
     // 4.Apply low level pass
-    func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref();
-    optim::RearrangeLoadInstruction(&(func->body));
+    if (i != func_bodies.size() - 1) {
+      func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref();
+      optim::RearrangeLoadInstruction(&(func->body));
+    } else {
+      func = optim::Optimize(Expr(func), common::DefaultHostTarget(), false)
+                 .as_lowered_func_ref();
+    }
     lowered_funcs.push_back(std::move(func));
   }
 
@@ -1327,6 +1174,87 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc(
                               {});
   return infer_shape_func;
 }
+
+// Lowers `ops` of `group` with the default host target and returns the merged
+// function body as a single expression.
+//
+// Returns ir::Expr(-1) without lowering anything unless, for every op:
+//   1. no operand has an unknown (dynamic) dimension,
+//   2. the running product of each operand's dims never exceeds 4, and
+//   3. no output type reported by CollectOutputInfo is float16.
+//
+// target_ and the runtime CurrentTarget are switched to the host target only
+// while LowerOps runs; afterwards both are set back to the previous target_.
+ir::Expr OpLowererImpl::LowerX86(const OpLoweringGroupPtr& group,
+                                 const std::vector<::pir::Operation*>& ops,
+                                 bool apply_op_schedule) {
+  std::vector<ir::Tensor> group_func_arg_tensors;
+  std::unordered_map<::pir::Value, ir::Tensor> tensor_map;
+  // for some op, it will output more tmp value and regard as
+  // XX_0, XX_1, so we log them in tmp_tensor_info;
+  std::unordered_map<std::string, ir::Tensor> tmp_tensor_info;
+
+  auto need_lower_x86 = [&]() -> bool {
+    for (auto* op : ops) {
+      for (size_t i = 0; i < op->num_operands(); ++i) {
+        auto in = op->operand_source(i);
+        auto type_info = in.type().dyn_cast<paddle::dialect::DenseTensorType>();
+        const auto& dims = type_info.dims();
+        // 1. Dynamic shapes are never lowered to x86.
+        if (::common::contain_unknown_dim(dims)) {
+          return false;
+        }
+        // 2. Only tiny operands are lowered to x86: bail out as soon as the
+        //    running product of the dims exceeds 4.
+        int64_t numel = 1;
+        for (int d = 0; d < dims.size(); ++d) {
+          numel *= dims[d];
+          if (numel > 4) {
+            return false;
+          }
+        }
+      }
+
+      std::vector<Type> out_types;
+      std::vector<std::vector<ir::Dim>> out_shapes;
+      CollectOutputInfo(op, &out_types, &out_shapes, group);
+      for (const auto& tt : out_types) {
+        // 3. float16 outputs are never lowered to x86.
+        if (tt.is_float16()) {
+          return false;
+        }
+      }
+    }
+    return true;
+  };
+  if (!need_lower_x86()) {
+    return ir::Expr(-1);
+  }
+
+  // Remember the target this lowerer was configured with, so it can be put
+  // back after the host lowering instead of assuming it was the NVGPU target.
+  const auto origin_target = this->target_;
+  this->target_ = common::DefaultHostTarget();
+  cinn::runtime::CurrentTarget::SetCurrentTarget(this->target_);
+
+  std::vector<ir::Expr> func_bodies =
+      LowerOps(group,
+               ops,
+               apply_op_schedule,
+               &OpLowererImpl::DyShapeScheduleDetermineFunction,
+               &group_func_arg_tensors,
+               &tensor_map,
+               &tmp_tensor_info);
+
+  this->target_ = origin_target;
+  cinn::runtime::CurrentTarget::SetCurrentTarget(this->target_);
+
+  ir::ModuleExpr mod_expr(func_bodies);
+  ir::IRSchedule ir_sch(
+      mod_expr, -1, false, cinn::utils::ErrorMessageLevel::kGeneral, true);
+  ir_sch.MergeExprs();
+  return ir::ir_utils::IRCopy(ir_sch.GetModule().GetExprs().at(0));
+}
 
 }  // namespace pir
 }  // namespace framework
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h
index 838b70da20fa5..9edb88ec3e431 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h
@@ -57,10 +57,6 @@ struct GroupInfo {
   std::set<std::string> shared_var_names;
   std::set<std::string> direct_output_var_names;
   std::vector<std::string> broadcast_output_names;
-
-  std::unordered_map<std::string, cinn::ir::BroadcastInfo> broadcast_info;
-  std::unordered_map<std::string, cinn::ir::BroadcastInfo>
-      broadcast_to_elementwise;
 };
 
 class OpLowererImpl : public OpLowererImplBase<OpLoweringGroupPtr> {
@@ -296,12 +292,11 @@ class OpLowererImpl : public OpLowererImplBase<OpLoweringGroupPtr> {
 
   void BuildBroadcastInfo(const OpLoweringGroupPtr& group,
                           std::shared_ptr<GroupInfo> group_info);
-
   Target target_;
-
+  ir::Expr LowerX86(const OpLoweringGroupPtr& group,
+                    const std::vector<::pir::Operation*>& ops,
+                    bool apply_op_schedule);
   PrettyNamer* name_gene_;
-
-  std::unordered_set<::pir::Operation*> erase_reshape;
 };
 
 }  // namespace pir
diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h
index c489e1847f26f..e3e4e8163cfb9 100644
--- a/paddle/cinn/hlir/framework/pir/utils.h
+++ b/paddle/cinn/hlir/framework/pir/utils.h
@@ -33,6 +33,7 @@ struct CINNKernelInfo {
   std::string fn_name;
   void* fn_ptr;
   void* infer_shape_fn_ptr;
+  void* CX86_fn_ptr;
 
   struct ArgDimIdx {
     int arg_idx;
diff --git a/paddle/cinn/hlir/framework/pir_compiler.cc b/paddle/cinn/hlir/framework/pir_compiler.cc
index 666ae3d340138..2b13f8a0a5d9c 100644
--- a/paddle/cinn/hlir/framework/pir_compiler.cc
+++ b/paddle/cinn/hlir/framework/pir_compiler.cc
@@ -77,6 +77,8 @@ std::vector<pir::CINNKernelInfo> PirCompiler::Build(
     auto worker_fn = [&](int index) {
       CompilationTask task(&group_compilation_contexts[index]);
       compilation_results[index] = task();
+      // Triggering llvm compilation in thread
+      compilation_results[index]->GetKernelInfo();
     };
     utils::parallel_run(worker_fn,
                         utils::SequenceDispatcher(0, task_size),
diff --git a/paddle/cinn/hlir/pass/alterlayout.cc b/paddle/cinn/hlir/pass/alterlayout.cc
index 74c8c0915e0af..a747c57dd77af 100644
--- a/paddle/cinn/hlir/pass/alterlayout.cc
+++ b/paddle/cinn/hlir/pass/alterlayout.cc
@@ -20,7 +20,7 @@
 #include "paddle/cinn/hlir/pe/schedule.h"
 #include "paddle/cinn/ir/layout.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -119,10 +119,26 @@ std::vector<framework::shape_t> UpdateInferInfos(
   CHECK(!infertypes.empty()) << node->op()->name << " finds no infertype";
   CHECK(!inferlayouts.empty()) << node->op()->name << " finds no inferlayout";
   auto outlinks = node->outlinks_in_order();
-  CHECK_EQ(infershapes.size(), infertypes.size());
-  CHECK_EQ(inferlayouts.size(), 2U);
-  CHECK_EQ(infertypes.size(), inferlayouts[0].size());
-  CHECK_EQ(outlinks.size(), infershapes.size());
+  PADDLE_ENFORCE_EQ(
+      infershapes.size(),
+      infertypes.size(),
+      phi::errors::InvalidArgument(
+          "The size of infershapes and infertypes should be equal"));
+  PADDLE_ENFORCE_EQ(inferlayouts.size(),
+                    2U,
+                    phi::errors::InvalidArgument(
+                        "The size of inferlayouts should be 2, but got %d",
+                        inferlayouts.size()));
+  PADDLE_ENFORCE_EQ(
+      infertypes.size(),
+      inferlayouts[0].size(),
+      phi::errors::InvalidArgument(
+          "The size of infertypes and inferlayouts[0] should be equal"));
+  PADDLE_ENFORCE_EQ(
+      outlinks.size(),
+      infershapes.size(),
+      phi::errors::InvalidArgument(
+          "The size of outlinks and infershapes should be equal"));
 
   for (int i = 0; i < outlinks.size(); i++) {
     auto* sink = outlinks[i]->sink();
@@ -181,7 +197,11 @@ void AlterLayoutPass(Graph* graph) {
               node->attrs.attr_store.at("dilation"));
         }
         const auto& conv_inlinks = node->inlinks_in_order();
-        CHECK_EQ(conv_inlinks.size(), 2U) << "conv2d should have 2 inputs";
+        PADDLE_ENFORCE_EQ(conv_inlinks.size(),
+                          2U,
+                          phi::errors::InvalidArgument(
+                              "conv2d should have 2 inputs, but got %d",
+                              conv_inlinks.size()));
         std::vector<std::vector<int>> inputs_shape;
         for (auto& link : conv_inlinks) {
           auto* source = link->source();
@@ -231,8 +251,11 @@ void AlterLayoutPass(Graph* graph) {
             input_nodes.push_back(source);
           }
           // get new layout: ic_bn, oc_bn
-          CHECK_EQ(input_nodes.size(), 2U)
-              << "conv2d should have 2 input nodes";
+          PADDLE_ENFORCE_EQ(input_nodes.size(),
+                            2U,
+                            phi::errors::InvalidArgument(
+                                "conv2d should have 2 input nodes, but got %d",
+                                input_nodes.size()));
           auto* input_node = input_nodes[0];
           auto* weight_node = input_nodes[1];
           CHECK(shape_dict.count(input_node->id()))
@@ -347,8 +370,11 @@ void AlterLayoutPass(Graph* graph) {
             conv2d_NCHWc_inputtypes.push_back(trans_out_dtypes);
             conv2d_NCHWc_inputlayouts.push_back(dst_input_layout);
           } else {
-            CHECK_EQ(input_shape.size(), 5U)
-                << "conv2d_NCHWc op's input shape dim should be 5";
+            PADDLE_ENFORCE_EQ(
+                input_shape.size(),
+                5U,
+                phi::errors::InvalidArgument(
+                    "conv2d_NCHWc op's input shape dim should be 5"));
             conv2d_NCHWc_inputshapes.push_back(input_shape);
             conv2d_NCHWc_inputtypes.push_back(input_type);
             CHECK(layout_dict.count(input_node->id()))
@@ -395,8 +421,11 @@ void AlterLayoutPass(Graph* graph) {
             conv2d_NCHWc_inputtypes.push_back(trans_out_dtypes);
             conv2d_NCHWc_inputlayouts.push_back(dst_kernel_layout);
           } else {
-            CHECK_EQ(weight_shape.size(), 6U)
-                << weight_node->id() << " shape dim should be 6";
+            PADDLE_ENFORCE_EQ(
+                weight_shape.size(),
+                6U,
+                phi::errors::InvalidArgument(
+                    "conv2d_NCHWc op's weight shape dim should be 6"));
             conv2d_NCHWc_inputshapes.push_back(weight_shape);
             conv2d_NCHWc_inputtypes.push_back(weight_type);
             CHECK(layout_dict.count(weight_node->id()))
@@ -477,12 +506,29 @@ void AlterLayoutPass(Graph* graph) {
               input_shapes, input_layouts, node->attrs, graph->target_);
           // if input inferred layouts is different from original's, expand dims
           // or do transformation.
-          CHECK_EQ(inferlayouts.size(), 2U);
+          PADDLE_ENFORCE_EQ(
+              inferlayouts.size(),
+              2U,
+              phi::errors::InvalidArgument(
+                  "The size of inferlayouts should be 2, but got %d",
+                  inferlayouts.size()));
           auto new_input_layouts = inferlayouts[1];
           auto inlinks = node->inlinks_in_order();
-          CHECK_EQ(input_layouts.size(), inlinks.size());
-          CHECK_EQ(input_layouts.size(), new_input_layouts.size());
-          CHECK_EQ(input_layouts.size(), input_shapes.size());
+          PADDLE_ENFORCE_EQ(
+              input_layouts.size(),
+              inlinks.size(),
+              phi::errors::InvalidArgument(
+                  "The size of input_layouts and inlinks should be equal"));
+          PADDLE_ENFORCE_EQ(input_layouts.size(),
+                            new_input_layouts.size(),
+                            phi::errors::InvalidArgument(
+                                "The size of input_layouts and "
+                                "new_input_layouts should be equal"));
+          PADDLE_ENFORCE_EQ(
+              input_layouts.size(),
+              input_shapes.size(),
+              phi::errors::InvalidArgument("The size of input_layouts and "
+                                           "input_shapes should be equal"));
           bool reset_axis = false;
           for (int i = 0; i < inlinks.size(); i++) {
             if (input_layouts[i] != new_input_layouts[i]) {
diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc
index 0326a4a5fce33..c0bccf285c730 100644
--- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc
+++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc
@@ -27,7 +27,7 @@
 #include "paddle/cinn/hlir/framework/visualize_helper.h"
 #include "paddle/cinn/hlir/pass/fusion_helper_base.h"
 #include "paddle/cinn/runtime/custom_function.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn::hlir::pass {
 
 using framework::Graph;
@@ -529,8 +529,10 @@ std::vector<Node*> CheckFusionAccuracyPass::TopologicalOrder(
     }
   }
 
-  CHECK_EQ(ordered_nodes.size(), nodes.size())
-      << "There has circle in group! Please check.";
+  PADDLE_ENFORCE_EQ(
+      ordered_nodes.size(),
+      nodes.size(),
+      phi::errors::InvalidArgument("There has circle in group! Please check."));
 
   return ordered_nodes;
 }
diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc
index 10f5c83e6600d..447da47e147dc 100644
--- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc
+++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc
@@ -19,7 +19,7 @@
 
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/frontend/decomposer/test_helper.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn::frontend {
 
 using hlir::framework::Graph;
@@ -96,7 +96,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D"});
 }
@@ -134,7 +138,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_1) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D"});
 }
@@ -175,7 +183,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_2) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D", "E", "F"});
 }
@@ -216,7 +228,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_3) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D", "E", "F"});
 }
@@ -257,7 +273,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_4) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D", "E", "F"});
 }
@@ -291,7 +311,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_5) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B"});
 }
@@ -328,7 +352,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_0) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D"});
 }
@@ -365,7 +393,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_2) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D"});
 }
@@ -404,7 +436,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_4) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D", "E"});
 }
@@ -443,7 +479,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_5) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D", "E"});
 }
@@ -479,7 +519,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_0) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B"});
 }
@@ -514,7 +558,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_1) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B"});
 }
@@ -552,7 +600,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_2) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C"});
 }
@@ -590,7 +642,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_3) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D"});
 }
@@ -629,7 +685,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_4) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B", "C", "D"});
 }
@@ -665,7 +725,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_5) {
   VLOG(1) << "After CheckFusionAccuracyPass:\n"
           << graph->DebugGroupedGraph(std::unordered_set<std::string>{});
 
-  CHECK_EQ(graph->fusion_groups.size(), group_size_after);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    group_size_after,
+                    phi::errors::InvalidArgument(
+                        "The number of fusion groups is not equal to the "
+                        "number of groups after the pass."));
 
   RunTest(target, graph, {"A", "B"});
 }
diff --git a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc
index 748948f2206fc..a6fb84f76b832 100644
--- a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc
+++ b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc
@@ -21,7 +21,7 @@
 #include "paddle/cinn/hlir/op/op_util.h"
 #include "paddle/cinn/utils/functional.h"
 #include "paddle/cinn/utils/type_defs.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -238,7 +238,10 @@ void fold_expand_dims_fill_constant(const FusionHelperBase* helper,
   // [0, total_size-1]. check axes can't repeat.
   std::sort(axes.begin(), axes.end(), std::less<int>());
   for (int idx = 0; idx < axes_size - 1; ++idx) {
-    CHECK_NE(axes[idx], axes[idx + 1]);
+    PADDLE_ENFORCE_NE(axes[idx],
+                      axes[idx + 1],
+                      phi::errors::InvalidArgument(
+                          "The axes of expand_dims should not repeat."));
   }
   // insert 1 to new shape.
   std::vector<int> n_shape(total_size, 1);
diff --git a/paddle/cinn/hlir/pass/dce_pass.cc b/paddle/cinn/hlir/pass/dce_pass.cc
index b17f8ee4de5d9..2a68e90bc342a 100644
--- a/paddle/cinn/hlir/pass/dce_pass.cc
+++ b/paddle/cinn/hlir/pass/dce_pass.cc
@@ -16,7 +16,7 @@
 
 #include "paddle/cinn/common/type.h"
 #include "paddle/cinn/hlir/pass/op_fusion_pass_util.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -118,7 +118,10 @@ class DceHelper : public FusionHelperBase {
 };
 
 void DCEPassInternal(Graph* graph) {
-  CHECK_GT(graph->outputs.size(), 0);
+  PADDLE_ENFORCE_GT(graph->outputs.size(),
+                    0,  // strict bound: an empty outputs list is rejected
+                    phi::errors::InvalidArgument(
+                        "The graph should have at least one output node."));
   DceHelper dce_helper(graph);
   dce_helper();
 }
diff --git a/paddle/cinn/hlir/pass/dce_pass_test.cc b/paddle/cinn/hlir/pass/dce_pass_test.cc
index bb9c5d7654851..1ebc0878ee2cb 100644
--- a/paddle/cinn/hlir/pass/dce_pass_test.cc
+++ b/paddle/cinn/hlir/pass/dce_pass_test.cc
@@ -15,7 +15,7 @@
 #include <gtest/gtest.h>
 
 #include "paddle/cinn/frontend/decomposer/test_helper.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace frontend {
 
@@ -36,7 +36,10 @@ TEST(DCE, Test_0) {
       std::make_shared<hlir::framework::Graph>(program, fetch_ids, target);
   hlir::framework::ApplyPass(graph.get(), "DCE");
 
-  CHECK_EQ(graph->nodes().size(), 4);
+  PADDLE_ENFORCE_EQ(
+      graph->nodes().size(),
+      4,
+      phi::errors::InvalidArgument("The graph nodes's size should be 4."));
 }
 
 TEST(DCE, Test_1) {
@@ -59,7 +62,10 @@ TEST(DCE, Test_1) {
   auto graph =
       std::make_shared<hlir::framework::Graph>(program, fetch_ids, target);
   hlir::framework::ApplyPass(graph.get(), "DCE");
-  CHECK_EQ(graph->nodes().size(), 8);
+  PADDLE_ENFORCE_EQ(
+      graph->nodes().size(),
+      8,
+      phi::errors::InvalidArgument("The graph nodes's size should be 8."));
 }
 
 }  // namespace frontend
diff --git a/paddle/cinn/hlir/pass/dense_merge_pass.cc b/paddle/cinn/hlir/pass/dense_merge_pass.cc
index a726aa1a36c1a..1fc5e4a52b60d 100644
--- a/paddle/cinn/hlir/pass/dense_merge_pass.cc
+++ b/paddle/cinn/hlir/pass/dense_merge_pass.cc
@@ -15,7 +15,7 @@
 #include "paddle/cinn/common/graph_utils.h"
 #include "paddle/cinn/common/type.h"
 #include "paddle/cinn/hlir/pass/fusion_helper_base.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -100,7 +100,13 @@ class DenseMergePassHelper : public FusionHelperBase {
     std::unordered_map<std::string, std::vector<Node*>> dense_op_map;
     for (auto dense_op : dense_ops) {
       const auto& in_links = dense_op->inlinks_in_order();
-      CHECK_GT(in_links.size(), pos);
+      PADDLE_ENFORCE_GT(in_links.size(),
+                        pos,
+                        phi::errors::InvalidArgument(
+                            "The input link size of dense op should be greater "
+                            "than %d, but got %d.",
+                            pos,
+                            in_links.size()));
       auto sign = GenOpSign(in_links[pos]->source()->safe_as<NodeData>(),
                             dense_op->attrs);
       if (dense_op_map.count(sign)) {
@@ -131,7 +137,14 @@ class DenseMergePassHelper : public FusionHelperBase {
         const auto& in_links = op->inlinks_in_order();
         node->UnLinkSingleTo(op);
         // link to new node
-        CHECK_GT(in_links.size(), pos);
+        PADDLE_ENFORCE_GT(
+            in_links.size(),
+            pos,
+            phi::errors::InvalidArgument("The input link size of dense "
+                                         "op should be greater than %d, "
+                                         "but got %d.",
+                                         pos,
+                                         in_links.size()));
         in_links[pos]->source()->LinkTo(node_tmp);
         // unlink old dense node
         in_links[pos]->source()->UnLinkSingleTo(op);
diff --git a/paddle/cinn/hlir/pass/dot_merger.cc b/paddle/cinn/hlir/pass/dot_merger.cc
index 941cf6b29b66c..6e4e4108ecd91 100644
--- a/paddle/cinn/hlir/pass/dot_merger.cc
+++ b/paddle/cinn/hlir/pass/dot_merger.cc
@@ -16,7 +16,7 @@
 #include "paddle/cinn/hlir/framework/graph.h"
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/pass/infershape.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -368,9 +368,12 @@ class DotMergerPass {
           input_operand(merge_nodes[i - 1], axis)->id());
       auto shape_b =
           builder->shape_dict().at(input_operand(merge_nodes[i], axis)->id());
-      CHECK_EQ(shape_a[1 - axis], shape_b[1 - axis])
-          << "The shape of matmul is error. " << shape_a.size() << ", "
-          << shape_b.size();
+      PADDLE_ENFORCE_EQ(
+          shape_a[1 - axis],
+          shape_b[1 - axis],
+          phi::errors::InvalidArgument("The shape of matmul is error. %d, %d",
+                                       shape_a.size(),
+                                       shape_b.size()));
       concat_nodes.push_back(input_operand(merge_nodes[i], axis));
     }
     auto* concat_out = builder->Concat(axis, concat_nodes);
@@ -444,9 +447,12 @@ class DotMergerPass {
     auto shape_shared = builder->shape_dict().at(shared_input->id());
     auto shape_a = builder->shape_dict().at(input_a->id());
     auto shape_b = builder->shape_dict().at(input_b->id());
-    CHECK_EQ(shape_a[1 - axis], shape_b[1 - axis])
-        << "The shape of matmul is error. " << shape_a.size() << ", "
-        << shape_b.size();
+    PADDLE_ENFORCE_EQ(
+        shape_a[1 - axis],
+        shape_b[1 - axis],
+        phi::errors::InvalidArgument("The shape of matmul is error. %d, %d",
+                                     shape_a.size(),
+                                     shape_b.size()));
     auto* concat_out = builder->Concat(axis, {input_a, input_b});
     NodeData* matmul_out{};
     if (!lhs) {
diff --git a/paddle/cinn/hlir/pass/fusion_helper_base.h b/paddle/cinn/hlir/pass/fusion_helper_base.h
index 3437b334fa5df..79580815d91bf 100644
--- a/paddle/cinn/hlir/pass/fusion_helper_base.h
+++ b/paddle/cinn/hlir/pass/fusion_helper_base.h
@@ -23,7 +23,7 @@
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/pass/use_pass.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -104,7 +104,10 @@ class FusionHelperBase {
 
   shape_t GetNodeInputShape(const Node* node) const {
     auto node_datas = GetProducerNodeData(node);
-    CHECK_GT(node_datas.size(), 0);
+    PADDLE_ENFORCE_GT(
+        node_datas.size(),
+        0,
+        phi::errors::InvalidArgument("The input node should not be empty!"));
     CHECK(shape_dict_.count(node_datas[0]->id()))
         << "Can't find " << node_datas[0]->id() << " 's shape!";
     return shape_dict_.at(node_datas[0]->id());
@@ -168,7 +171,10 @@ class FusionHelperBase {
 
   int GetSharedSize(const Node* node) const {
     auto producers = GetProducerNodeData(node);
-    CHECK_GT(producers.size(), 0);
+    PADDLE_ENFORCE_GT(
+        producers.size(),
+        0,
+        phi::errors::InvalidArgument("The input node should not be empty!"));
     auto inshape = shape_dict_.at(producers[0]->id());
     auto axes = absl::get<std::vector<int>>(node->attrs.attr_store.at("dim"));
     if (WithoutLastDimInReduce(inshape, axes)) {
diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc
index fd023662f9050..0d93dd1593c4f 100644
--- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc
+++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/cinn/hlir/pass/fusion_merge_pass_util.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(enhance_vertical_fusion_with_recompute);
 
 namespace cinn {
@@ -705,7 +705,11 @@ class FusionMergePassHelper : public FusionHelperBase {
         }
       }
 
-      CHECK_GE(producer->consumer_groups().size(), candidates.size());
+      PADDLE_ENFORCE_GE(producer->consumer_groups().size(),
+                        candidates.size(),
+                        phi::errors::InvalidArgument(
+                            "The number of candidates should be less than or "
+                            "equal to the number of consumer groups!"));
       if (producer->consumer_groups().size() == 0 && candidates.size() == 0 &&
           output_nodes_set_.count(producer->CollectNodes()[0]) == 0) {
         producer->belong_groups.insert(*fusionable_consumers->begin());
@@ -959,8 +963,16 @@ class FusionMergePassHelper : public FusionHelperBase {
         CHECK(consumer->belong_groups.size());
         consumers.insert(*consumer->belong_groups.begin());
       }
-      CHECK_EQ(group->producer_groups().size(), producers.size());
-      CHECK_EQ(group->consumer_groups().size(), consumers.size());
+      PADDLE_ENFORCE_EQ(group->producer_groups().size(),
+                        producers.size(),
+                        phi::errors::InvalidArgument(
+                            "The number of producers should be equal to the "
+                            "number of producer groups!"));
+      PADDLE_ENFORCE_EQ(group->consumer_groups().size(),
+                        consumers.size(),
+                        phi::errors::InvalidArgument(
+                            "The number of consumers should be equal to the "
+                            "number of consumer groups!"));
       (*group->mut_producer_groups()) = producers;
       (*group->mut_consumer_groups()) = consumers;
     }
diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc
old mode 100755
new mode 100644
index f6f9ecee97c43..14cc221edaaf0
--- a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc
+++ b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc
@@ -15,7 +15,7 @@
 #include <gtest/gtest.h>
 
 #include "paddle/cinn/frontend/decomposer/test_helper.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace frontend {
 
@@ -39,9 +39,15 @@ TEST(FusionMergePass, ElementWise_Fusion_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, ElementWise_Fusion_1) {
@@ -65,9 +71,15 @@ TEST(FusionMergePass, ElementWise_Fusion_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 4);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 4."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, ElementWise_Fusion_2) {
@@ -94,9 +106,15 @@ TEST(FusionMergePass, ElementWise_Fusion_2) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 5);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 5."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, ElementWise_Fusion_3) {
@@ -123,9 +141,15 @@ TEST(FusionMergePass, ElementWise_Fusion_3) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 5);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 5."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, ElementWise_Fusion_4) {
@@ -152,9 +176,15 @@ TEST(FusionMergePass, ElementWise_Fusion_4) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 5);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 5."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, ElementWise_Fusion_5) {
@@ -174,9 +204,15 @@ TEST(FusionMergePass, ElementWise_Fusion_5) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 2."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, Broadcast_Test_0) {
@@ -199,9 +235,15 @@ TEST(FusionMergePass, Broadcast_Test_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, Broadcast_Test_1) {
@@ -224,9 +266,15 @@ TEST(FusionMergePass, Broadcast_Test_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 TEST(FusionMergePass, Broadcast_Test_2) {
@@ -249,9 +297,15 @@ TEST(FusionMergePass, Broadcast_Test_2) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 2."));
 }
 
 TEST(FusionMergePass, Broadcast_Test_3) {
@@ -274,9 +328,15 @@ TEST(FusionMergePass, Broadcast_Test_3) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 2."));
 }
 
 TEST(FusionMergePass, Broadcast_Test_4) {
@@ -301,9 +361,15 @@ TEST(FusionMergePass, Broadcast_Test_4) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 4);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 4."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 2."));
 }
 
 TEST(FusionMergePass, Broadcast_Test_5) {
@@ -328,9 +394,15 @@ TEST(FusionMergePass, Broadcast_Test_5) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 4);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 4."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
 }
 
 TEST(FusionMergePass, Reduce_Test_0) {
@@ -352,7 +424,10 @@ TEST(FusionMergePass, Reduce_Test_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 4);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 4."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
   // CHECK_EQ(graph->fusion_groups.size(), 2);
 }
@@ -375,9 +450,15 @@ TEST(FusionMergePass, Reduce_Test_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 2."));
 }
 
 TEST(FusionMergePass, Reduce_Test_2) {
@@ -401,9 +482,15 @@ TEST(FusionMergePass, Reduce_Test_2) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 2."));
 }
 
 TEST(FusionMergePass, Reduce_Test_3) {
@@ -427,7 +514,10 @@ TEST(FusionMergePass, Reduce_Test_3) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 4);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 4."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
   // CHECK_EQ(graph->fusion_groups.size(), 3);
 }
@@ -454,7 +544,10 @@ TEST(FusionMergePass, Reduce_Test_4) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 5);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    5,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 5."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
   // CHECK_EQ(graph->fusion_groups.size(), 3);
 }
@@ -478,9 +571,15 @@ TEST(FusionMergePass, Reduce_Test_5) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 3);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    3,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 3."));
   hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(graph->fusion_groups.size(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The graph fusion groups's size should be 1."));
 }
 
 }  // namespace frontend
diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc
index b9d553019a459..b27565194f293 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc
@@ -25,7 +25,7 @@
 #include "paddle/cinn/hlir/pass/general_fusion_merge_pass/lightware_fuse_pass.h"
 #include "paddle/cinn/hlir/pass/general_fusion_merge_pass/lightware_fuse_pass_ctx.h"
 #include "paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(enhance_vertical_fusion_with_recompute);
 
 namespace cinn {
@@ -840,7 +840,11 @@ class GeneralFusionMergePassHelper : public FusionHelperBase {
         }
       }
 
-      CHECK_GE(producer->consumer_groups().size(), candidates.size());
+      PADDLE_ENFORCE_GE(
+          producer->consumer_groups().size(),
+          candidates.size(),
+          phi::errors::Fatal("The number of candidates should be less than or "
+                             "equal to the number of consumers."));
       if (producer->consumer_groups().size() == 0 && candidates.size() == 0 &&
           output_nodes_set_.count(producer->CollectNodes()[0]) == 0) {
         producer->belong_groups.insert(*fusionable_consumers->begin());
@@ -1035,8 +1039,14 @@ class GeneralFusionMergePassHelper : public FusionHelperBase {
         CHECK(consumer->belong_groups.size());
         consumers.insert(*consumer->belong_groups.begin());
       }
-      CHECK_EQ(group->producer_groups().size(), producers.size());
-      CHECK_EQ(group->consumer_groups().size(), consumers.size());
+      PADDLE_ENFORCE_EQ(
+          group->producer_groups().size(),
+          producers.size(),
+          phi::errors::InvalidArgument("Producer size is not equal!"));
+      PADDLE_ENFORCE_EQ(
+          group->consumer_groups().size(),
+          consumers.size(),
+          phi::errors::InvalidArgument("Consumer size is not equal!"));
       (*group->mut_producer_groups()) = producers;
       (*group->mut_consumer_groups()) = consumers;
     }
diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h
index 2195d4a4f947b..a8ccbcef27a16 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h
@@ -16,7 +16,7 @@
 
 #include "paddle/cinn/api/op_group.h"
 #include "paddle/cinn/hlir/pass/fusion_merge_pass_util.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -135,7 +135,10 @@ inline bool WithoutLastDimInReduce(const api::Shape& inshape,
 
 static int GetSharedSize(const api::OpNode& op_node) {
   const auto& producers = op_node.inputs();
-  CHECK_GT(producers.size(), 0);
+  PADDLE_ENFORCE_GT(producers.size(),
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The producer size should be greater than 0."));
   const auto& inshape = producers[0].shape();
   const auto& axes = op_node.GetAttr<std::vector<int>>("dim");
   if (WithoutLastDimInReduce(inshape, axes)) {
diff --git a/paddle/cinn/hlir/pass/infershape.cc b/paddle/cinn/hlir/pass/infershape.cc
index 041a63b42b57c..c6a7a6422d8a8 100644
--- a/paddle/cinn/hlir/pass/infershape.cc
+++ b/paddle/cinn/hlir/pass/infershape.cc
@@ -19,7 +19,7 @@
 #include "paddle/cinn/hlir/pass/use_pass.h"
 #include "paddle/cinn/hlir/pe/schedule.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -76,16 +76,28 @@ void InferShape(Node* node,
   auto out_dtype =
       op_inferdtype[node->op()](inputs_dtype, node->attrs.attr_store);
 
-  CHECK_GE(node->outlinks_in_order().size(), out_shape.size())
-      << "The output number of node " << node->id() << " is "
-      << node->outlinks_in_order().size()
-      << " , which is smaller than the output shape size " << out_shape.size()
-      << " . And the op type is " << node->op()->name;
-  CHECK_GE(node->outlinks_in_order().size(), out_dtype.size())
-      << "The output number of node " << node->id() << " is "
-      << node->outlinks_in_order().size()
-      << " , which is smaller than the output dtype size " << out_dtype.size()
-      << " . And the op type is " << node->op()->name;
+  // Carry the node id, both sizes and the op type in the error message, as
+  // the replaced CHECK_GE diagnostics did, so a failure identifies the node.
+  PADDLE_ENFORCE_GE(
+      node->outlinks_in_order().size(),
+      out_shape.size(),
+      phi::errors::InvalidArgument(
+          "The output number of node %s is %d , which is smaller than the "
+          "output shape size %d . And the op type is %s",
+          node->id(),
+          node->outlinks_in_order().size(),
+          out_shape.size(),
+          node->op()->name));
+  PADDLE_ENFORCE_GE(
+      node->outlinks_in_order().size(),
+      out_dtype.size(),
+      phi::errors::InvalidArgument(
+          "The output number of node %s is %d , which is smaller than the "
+          "output dtype size %d . And the op type is %s",
+          node->id(),
+          node->outlinks_in_order().size(),
+          out_dtype.size(),
+          node->op()->name));
 
   int counter = 0;
   for (auto& out_edge : node->outlinks_in_order()) {
diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc
old mode 100755
new mode 100644
index c9d723c91be50..8c18782cc031d
--- a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc
+++ b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc
@@ -15,7 +15,7 @@
 #include <gtest/gtest.h>
 
 #include "paddle/cinn/frontend/decomposer/test_helper.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace frontend {
 
@@ -39,7 +39,10 @@ TEST(OpFusionPass, ElementWise_Fusion_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, ElementWise_Fusion_1) {
@@ -63,7 +66,10 @@ TEST(OpFusionPass, ElementWise_Fusion_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, Broadcast_Test_0) {
@@ -86,7 +92,10 @@ TEST(OpFusionPass, Broadcast_Test_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, Broadcast_Test_1) {
@@ -111,7 +120,10 @@ TEST(OpFusionPass, Broadcast_Test_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, Broadcast_Test_2) {
@@ -131,7 +143,10 @@ TEST(OpFusionPass, Broadcast_Test_2) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, Reduce_Test_0) {
@@ -155,7 +170,10 @@ TEST(OpFusionPass, Reduce_Test_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      2,
+      phi::errors::InvalidArgument("fusion group size should be 2"));
 }
 
 TEST(OpFusionPass, Reduce_Test_1) {
@@ -180,7 +198,10 @@ TEST(OpFusionPass, Reduce_Test_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, Reduce_Test_2) {
@@ -205,7 +226,10 @@ TEST(OpFusionPass, Reduce_Test_2) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 2);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      2,
+      phi::errors::InvalidArgument("fusion group size should be 2"));
 }
 
 TEST(OpFusionPass, Injective_Test_0) {
@@ -229,7 +253,10 @@ TEST(OpFusionPass, Injective_Test_0) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OP_LOWERING, Injective_Test_1) {
@@ -247,7 +274,10 @@ TEST(OP_LOWERING, Injective_Test_1) {
 
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 TEST(OpFusionPass, Test_Insert_BroadcastTo) {
@@ -269,7 +299,10 @@ TEST(OpFusionPass, Test_Insert_BroadcastTo) {
   auto graph = std::make_shared<hlir::framework::Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
 
-  CHECK_EQ(graph->fusion_groups.size(), 1);
+  PADDLE_ENFORCE_EQ(
+      graph->fusion_groups.size(),
+      1,
+      phi::errors::InvalidArgument("fusion group size should be 1"));
 }
 
 }  // namespace frontend
diff --git a/paddle/cinn/hlir/pass/opfusion.cc b/paddle/cinn/hlir/pass/opfusion.cc
index c8690c0625fbb..84a4071144f96 100644
--- a/paddle/cinn/hlir/pass/opfusion.cc
+++ b/paddle/cinn/hlir/pass/opfusion.cc
@@ -21,7 +21,7 @@
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/pass/use_pass.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -48,8 +48,14 @@ void GetBroadcastPattern(
   if (*pattern == framework::kBroadcast) {
     auto inlinks = op_node->inlinks();
     auto outlinks = op_node->outlinks();
-    CHECK_EQ(inlinks.size(), 2U);
-    CHECK_EQ(outlinks.size(), 1U);
+    PADDLE_ENFORCE_EQ(
+        inlinks.size(),
+        2U,
+        phi::errors::InvalidArgument("Broadcast op should have 2 inputs"));
+    PADDLE_ENFORCE_EQ(
+        outlinks.size(),
+        1U,
+        phi::errors::InvalidArgument("Broadcast op should have 1 output"));
     std::vector<framework::shape_t> input_shapes;
     for (auto link : inlinks) {
       auto source = link->source();
@@ -233,7 +239,11 @@ class GraphPartition {
   std::vector<std::vector<Node*>> Partition(
       const std::vector<GraphNode*>& graph_nodes,
       const std::vector<DomNode*>& dom_nodes) {
-    CHECK_EQ(graph_nodes.size(), dom_nodes.size());
+    PADDLE_ENFORCE_EQ(
+        graph_nodes.size(),
+        dom_nodes.size(),
+        phi::errors::InvalidArgument(
+            "graph_nodes size should be equal to dom_nodes size"));
     InitGroups(graph_nodes);
     for (int i = 0; i < 2; i++) {
       FuseGroups(graph_nodes, dom_nodes, i);
@@ -457,8 +467,16 @@ class GraphPartition {
   void FuseGroups(const std::vector<GraphNode*>& graph_nodes,
                   const std::vector<DomNode*>& dom_nodes,
                   int phase) {
-    CHECK_EQ(graph_nodes.size(), dom_nodes.size());
-    CHECK_EQ(group_nodes_.size(), dom_nodes.size());
+    PADDLE_ENFORCE_EQ(
+        graph_nodes.size(),
+        dom_nodes.size(),
+        phi::errors::InvalidArgument(
+            "graph_nodes size should be equal to dom_nodes size"));
+    PADDLE_ENFORCE_EQ(
+        group_nodes_.size(),
+        dom_nodes.size(),
+        phi::errors::InvalidArgument(
+            "group_nodes size should be equal to dom_nodes size"));
     for (int i = 0; i < graph_nodes.size(); i++) {
       auto* graph_node = graph_nodes[i];
       auto* dom_node = dom_nodes[i];
@@ -521,7 +539,11 @@ class GraphPartition {
   }
   void SplitGroups(const std::vector<cinn::common::GraphNode*>& graph_nodes) {
     // split groups sorted by topo order
-    CHECK_EQ(graph_nodes.size(), group_nodes_.size());
+    PADDLE_ENFORCE_EQ(
+        graph_nodes.size(),
+        group_nodes_.size(),
+        phi::errors::InvalidArgument(
+            "graph_nodes size should be equal to group_nodes size"));
     absl::flat_hash_map<int, std::vector<Node*>> group_maps;
     std::set<int> root_indice;
     for (int i = 0; i < graph_nodes.size(); i++) {
diff --git a/paddle/cinn/hlir/pass/reduce_split_pass.cc b/paddle/cinn/hlir/pass/reduce_split_pass.cc
index 899c233866ca5..cbb6ffa658c47 100644
--- a/paddle/cinn/hlir/pass/reduce_split_pass.cc
+++ b/paddle/cinn/hlir/pass/reduce_split_pass.cc
@@ -18,7 +18,7 @@
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/pass/infershape.h"
 #include "paddle/cinn/hlir/pe/nn_util.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace hlir {
 namespace pass {
@@ -103,7 +103,11 @@ class ReduceSplitPass {
         auto in_shape = shape_dict.at(in->id());
         auto out_shape = shape_dict.at(out->id());
         // all preceding reduced
-        CHECK_GT(in_shape.size(), 1);
+        PADDLE_ENFORCE_GT(
+            in_shape.size(),
+            1,
+            phi::errors::InvalidArgument(
+                "The input shape size should be greater than 1."));
         // [NHWC]->[C], only the last dim kept
         bool all_preceding_dim_reduced = true;
         for (auto i = 0; i < in_shape.size() - 1; ++i) {
@@ -122,7 +126,10 @@ class ReduceSplitPass {
             in_shape.begin(), in_shape.end(), 1, std::multiplies<int>());
         int reduce_numel = std::accumulate(
             in_shape.begin(), in_shape.end() - 1, 1, std::multiplies<int>());
-        CHECK_GT(reduce_numel, 0);
+        PADDLE_ENFORCE_GT(reduce_numel,
+                          0,
+                          phi::errors::InvalidArgument(
+                              "The reduce_numel should be greater than 0."));
         // if the numel is not large enough, it is no need to split
         // if loop times is too large with reduce optimize
         int size = std::accumulate(
@@ -132,7 +139,10 @@ class ReduceSplitPass {
         auto shape = pe::GetFirstStepReduceShape(
             {size, in_shape.back()}, {0}, bound, tail);
         CHECK(bound);
-        CHECK_EQ(shape.size(), 3);
+        PADDLE_ENFORCE_EQ(shape.size(),
+                          3,
+                          phi::errors::InvalidArgument(
+                              "The shape size should be equal to 3."));
 
         auto res = DivideToClosetNum(reduce_numel);
         int reduce_numel0 = std::get<0>(res), reduce_numel1 = std::get<1>(res);
diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc
index 41eb7f2fd2c10..41deddc1507e3 100644
--- a/paddle/cinn/hlir/pe/elementwise.cc
+++ b/paddle/cinn/hlir/pe/elementwise.cc
@@ -360,8 +360,8 @@ ir::Tensor GenerateShape(const std::vector<ir::Tensor>& inputs,
                          const std::vector<symbol::DimExpr>& output_dim_exprs,
                          const std::string& name) {
   if (output_dim_exprs.size() != 1) {
-    LOG(WARNING) << "pe::GenerateShape will return a meaningless tensor when "
-                    "output_dim_exprs.size() != 1";
+    VLOG(4) << "pe::GenerateShape will return a meaningless tensor when "
+               "output_dim_exprs.size() != 1";
     return Compute(
         {Expr(1)},
         [=](const std::vector<Expr>& indice) { return Expr(1); },
diff --git a/paddle/cinn/hlir/pe/schedule_param.proto b/paddle/cinn/hlir/pe/schedule_param.proto
index 1d869a570706d..4d2fca1a1b362 100644
--- a/paddle/cinn/hlir/pe/schedule_param.proto
+++ b/paddle/cinn/hlir/pe/schedule_param.proto
@@ -1,11 +1,11 @@
 // Copyright (c) 2021 CINN Authors. All Rights Reserved.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/cinn/ir/group_schedule/base_group_scheduler.h b/paddle/cinn/ir/group_schedule/base_group_scheduler.h
index ef77397066351..a96b972d889ea 100644
--- a/paddle/cinn/ir/group_schedule/base_group_scheduler.h
+++ b/paddle/cinn/ir/group_schedule/base_group_scheduler.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include "paddle/cinn/common/macros.h"
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/ir/group_schedule/config/group_tile_config.h"
 #include "paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h"
@@ -64,6 +65,9 @@ class GroupScheduler {
   virtual void Schedule() = 0;
 
   virtual std::vector<std::pair<SymbolicPredicate, ir::Expr>> GetIRs() = 0;
+  virtual std::vector<std::pair<SymbolicPredicate, ir::Expr>> GetCX86IRs() {
+    CINN_NOT_IMPLEMENTED;
+  }
 
   std::unordered_set<std::string> OutputTensorNames() const;
 
diff --git a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt
index f6453b645bdc7..256e919fce531 100644
--- a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt
+++ b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt
@@ -5,7 +5,10 @@ core_gather_headers()
 gather_srcs(cinnapi_src SRCS group_tile_config.cc)
 gather_srcs(cinnapi_src SRCS database.cc)
 
-cc_library(file_tile_database SRCS filedatabase.cc)
+cc_library(
+  file_tile_database
+  SRCS filedatabase.cc
+  DEPS absl tile_config_proto)
 
 foreach(header ${filetileconfig_proto_HDRS})
   set(core_proto_includes
diff --git a/paddle/cinn/ir/group_schedule/config/database.cc b/paddle/cinn/ir/group_schedule/config/database.cc
index a216530126efd..4e3121739b874 100644
--- a/paddle/cinn/ir/group_schedule/config/database.cc
+++ b/paddle/cinn/ir/group_schedule/config/database.cc
@@ -19,10 +19,16 @@ namespace ir {
 
 void NaiveTileConfigDatabase::AddConfig(
     const common::Target& target,
-    const IterSpaceType& iter_space_type,
     const BucketInfo& bucket_info,
     const ScheduleConfig::TileConfig& config,
     int priority) {
+  // Derive the iteration-space key from the bucket itself: one
+  // (iter_type, "dynamic" | "static") pair per dimension, in order.
+  IterSpaceType iter_space_type;
+  for (const auto& dim : bucket_info.space) {
+    iter_space_type.emplace_back(dim.iter_type,
+                                 dim.is_dynamic ? "dynamic" : "static");
+  }
   config_map_[iter_space_type][bucket_info] = config;
 }
 
diff --git a/paddle/cinn/ir/group_schedule/config/database.h b/paddle/cinn/ir/group_schedule/config/database.h
index 9d61f0dd615a5..14367ee492bba 100644
--- a/paddle/cinn/ir/group_schedule/config/database.h
+++ b/paddle/cinn/ir/group_schedule/config/database.h
@@ -32,7 +32,6 @@ using IterSpaceType = std::vector<std::pair<std::string, std::string>>;
 class TileConfigDatabase {
  public:
   virtual void AddConfig(const common::Target& target,
-                         const IterSpaceType& iter_space_type,
                          const BucketInfo& bucket_info,
                          const ScheduleConfig::TileConfig& config,
                          int priority) = 0;
@@ -45,7 +44,6 @@ class TileConfigDatabase {
 class NaiveTileConfigDatabase final : public TileConfigDatabase {
  public:
   void AddConfig(const common::Target& target,
-                 const IterSpaceType& iter_space_type,
                  const BucketInfo& bucket_info,
                  const ScheduleConfig::TileConfig& config,
                  int priority = 1) override;
diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.cc b/paddle/cinn/ir/group_schedule/config/filedatabase.cc
index 64741521802e9..58b5f13338f0a 100644
--- a/paddle/cinn/ir/group_schedule/config/filedatabase.cc
+++ b/paddle/cinn/ir/group_schedule/config/filedatabase.cc
@@ -39,22 +39,19 @@ namespace ir {
 
 bool TileConfigToProto(group_schedule::config::proto::TileData* tile_data,
                        const TileConfigMap& tile_config_map,
-                       const IterSpaceType& iter_space_type,
                        const int& priority) {
   for (auto& it : tile_config_map) {
-    group_schedule::config::proto::Dimension s_dimension, r_dimension;
-
     // prepare key---convert bucket info to proto::bucket_info
-    s_dimension.set_lower_bound(it.first.sp_lower_bound);
-    s_dimension.set_upper_bound(it.first.sp_upper_bound);
-    s_dimension.set_iter_type(iter_space_type[0].first);
-    s_dimension.set_is_dynamic(iter_space_type[0].second == "dynamic");
-    r_dimension.set_lower_bound(it.first.rb_lower_bound);
-    r_dimension.set_upper_bound(it.first.rb_upper_bound);
-    r_dimension.set_iter_type(iter_space_type[1].first);
-    r_dimension.set_is_dynamic(iter_space_type[1].second == "dynamic");
-    *(tile_data->mutable_bucket_info()->add_dimension()) = s_dimension;
-    *(tile_data->mutable_bucket_info()->add_dimension()) = r_dimension;
+    // Bind the map key by reference and fill each proto dimension in place,
+    // avoiding a BucketInfo copy and a temporary Dimension per entry.
+    const BucketInfo& bucket_info = it.first;
+    for (const auto& dim : bucket_info.space) {
+      auto* cur_dimension = tile_data->mutable_bucket_info()->add_dimension();
+      cur_dimension->set_lower_bound(dim.lower_bound);
+      cur_dimension->set_upper_bound(dim.upper_bound);
+      cur_dimension->set_iter_type(dim.iter_type);
+      cur_dimension->set_is_dynamic(dim.is_dynamic);
+    }
 
     // prepare value---transfer tile_config to proto::tile_config
     group_schedule::config::proto::TileConfig tc;
@@ -114,18 +111,24 @@ std::string IterSpaceTypeToDir(const common::Target target,
 }
 
 bool FileTileConfigDatabase::Tofile(const common::Target& target,
-                                    const IterSpaceType& iter_space_type,
                                     int priority) {
   // Step1. To proto
   TileConfigMap& tile_config_map = target_config_data_;
   group_schedule::config::proto::TileData tile_data;
-  auto is_success =
-      TileConfigToProto(&tile_data, tile_config_map, iter_space_type, priority);
+  auto is_success = TileConfigToProto(&tile_data, tile_config_map, priority);
   if (is_success == false) {
     PADDLE_THROW(::common::errors::Unavailable(
         "Can't convert tile_config_map to its proto message."));
   }
   // Step2. ToJson
+  // Guarded so begin() is never dereferenced when the map is empty.
+  IterSpaceType iter_space_type;
+  if (!tile_config_map.empty()) {
+    for (const auto& dim : tile_config_map.begin()->first.space) {
+      iter_space_type.emplace_back(dim.iter_type,
+                                   dim.is_dynamic ? "dynamic" : "static");
+    }
+  }
   std::string dump_path = IterSpaceTypeToDir(target, iter_space_type);
   size_t length = tile_config_map.size();
   std::vector<std::string> json_lines(length);
@@ -187,7 +190,7 @@ bool comparepriority(group_schedule::config::proto::TileData tile_data1,
 
 TileConfigMap FileTileConfigDatabase::GetConfigs(
     const common::Target& target, const IterSpaceType& iter_space_type) const {
-  // Step1. ReadFromJsonFile->Message;
+  // Step 1: Read from json file and convert json to proto message
   std::string file_path = IterSpaceTypeToDir(target, iter_space_type);
   auto json_lines = ReadLinesFromFile(file_path);
   size_t line_length = json_lines.size();
@@ -196,39 +199,41 @@ TileConfigMap FileTileConfigDatabase::GetConfigs(
       line_length);
   JsonStringToMessageOfTileConfig(&tile_database, json_lines);
 
-  // Step2. ParseFromProtoMessage();
+  // Step 2: Parse from proto message
   TileConfigMap tile_config_map;
   // order tile_database according to priority
   std::sort(tile_database.begin(), tile_database.end(), comparepriority);
   for (const auto& piece_tileconfig : tile_database) {
     group_schedule::config::proto::BucketInfo its =
         piece_tileconfig.bucket_info();
-    // proto::BucketInfo to  bucketinfo
-    BucketInfo bucket_info;
-    bucket_info.sp_lower_bound = its.dimension(0).lower_bound();
-    bucket_info.sp_upper_bound = its.dimension(0).upper_bound();
-    bucket_info.rb_lower_bound = its.dimension(1).lower_bound();
-    bucket_info.rb_upper_bound = its.dimension(1).upper_bound();
+    // Step 2.1: Convert the proto bucket info into a BucketInfo
+    BucketInfo bucket_info(static_cast<size_t>(its.dimension_size()));
+    for (int idx = 0; idx < its.dimension_size(); ++idx) {
+      const auto& proto_dim = its.dimension(idx);
+      bucket_info.space[idx].lower_bound = proto_dim.lower_bound();
+      bucket_info.space[idx].upper_bound = proto_dim.upper_bound();
+      bucket_info.space[idx].iter_type = proto_dim.iter_type();
+      bucket_info.space[idx].is_dynamic = proto_dim.is_dynamic();
+    }
+    // Step 2.2: Convert the proto tile_config into a TileConfig
     ScheduleConfig::TileConfig tconfig;
     tconfig.tree_reduce_num = piece_tileconfig.tile_config().tree_reduce_num();
     tconfig.spatial_inner_num =
         piece_tileconfig.tile_config().spatial_inner_num();
     tconfig.warp_num = piece_tileconfig.tile_config().warp_num();
     tile_config_map[bucket_info] = tconfig;
-    // Tode[XiaZichao] Add function to cut one lattice into smaller ones.
+    // TODO(XiaZichao): Add function to cut one lattice into smaller ones
   }
-  // ToDo[XiaZichao] update json file using top view of tileconfigMap
+  // TODO(XiaZichao): update json file using top view of tileconfigMap
   return tile_config_map;
 }
 
 void FileTileConfigDatabase::AddConfig(const common::Target& target,
-                                       const IterSpaceType& iter_space_type,
                                        const BucketInfo& bucket_info,
                                        const ScheduleConfig::TileConfig& config,
                                        int priority) {
   target_config_data_[bucket_info] = config;
-  auto status =
-      FileTileConfigDatabase::Tofile(target, iter_space_type, priority);
+  auto status = FileTileConfigDatabase::Tofile(target, priority);
   if (status == true) {
     target_config_data_.clear();
     return;
diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.h b/paddle/cinn/ir/group_schedule/config/filedatabase.h
index 19758dc828c18..3c6b62c676fe8 100644
--- a/paddle/cinn/ir/group_schedule/config/filedatabase.h
+++ b/paddle/cinn/ir/group_schedule/config/filedatabase.h
@@ -22,7 +22,6 @@ namespace ir {
 class FileTileConfigDatabase : TileConfigDatabase {
  public:
   void AddConfig(const common::Target& target,
-                 const IterSpaceType& iter_space_type,
                  const BucketInfo& bucket_info,
                  const ScheduleConfig::TileConfig& config,
                  int priority) override;
@@ -31,9 +30,7 @@ class FileTileConfigDatabase : TileConfigDatabase {
 
  private:
   TileConfigMap target_config_data_;
-  bool Tofile(const common::Target& target,
-              const IterSpaceType& iter_space_type,
-              int priority);
+  bool Tofile(const common::Target& target, int priority);
 };
 
 }  // namespace ir
diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc
index 40c1d134ac642..42f1a02adf723 100644
--- a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc
+++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc
@@ -20,6 +20,47 @@ namespace ir {
 
 const int kMaxNumel = INT32_MAX;
 
+BucketInfo::BucketInfo(int sp_lower_bound,
+                       int sp_upper_bound,
+                       int rb_lower_bound,
+                       int rb_upper_bound,
+                       bool sp_is_dynamic = false,
+                       bool rb_is_dynamic = false) {
+  BucketInfo::Dimension sp_dimension(
+      sp_lower_bound, sp_upper_bound, "S", sp_is_dynamic);
+  BucketInfo::Dimension rb_dimension(
+      rb_lower_bound, rb_upper_bound, "R", rb_is_dynamic);
+  this->space.push_back(sp_dimension);
+  this->space.push_back(rb_dimension);
+}
+
+bool BucketInfo::operator==(const BucketInfo& other) const {
+  if (this->space.size() != other.space.size()) {
+    return false;
+  }
+  int length = this->space.size();
+  for (int i = 0; i < length; i++) {
+    if (this->space[i].is_dynamic != other.space[i].is_dynamic ||
+        this->space[i].iter_type != other.space[i].iter_type ||
+        this->space[i].lower_bound != other.space[i].lower_bound ||
+        this->space[i].upper_bound != other.space[i].upper_bound) {
+      return false;
+    }
+  }
+  return true;
+}
+
+std::string BucketInfo::ToString() const {
+  std::stringstream ss;
+  ss << "BucketInfo: [";
+  for (const auto& dim : space) {
+    ss << dim.iter_type << "(" << dim.lower_bound << " - " << dim.upper_bound
+       << "), ";
+  }
+  ss << "]";
+  return ss.str();
+}
+
 int64_t Next2Power(int64_t n) {
   if (n == 1) {
     return 1;
@@ -34,8 +75,6 @@ std::shared_ptr<ScheduleConfig::BaseInfo> InitBasicInfo(
   base_info->reduce_tensor_names = group_info->reduce_var_names;
   base_info->shared_var_names = group_info->shared_var_names;
   base_info->direct_output_var_names = group_info->direct_output_var_names;
-  base_info->broadcast_info = group_info->broadcast_info;
-  base_info->broadcast_to_elementwise = group_info->broadcast_to_elementwise;
   base_info->data_rank = group_info->data_space.size();
   base_info->raw_data_rank = group_info->raw_data_rank;
 
@@ -190,7 +229,9 @@ BuildStaticSpatialConfig(
     BucketInfo bucket_info{/* sp_lower_bound = */ 1,
                            /* sp_upper_bound = */ 1,
                            /* rb_lower_bound = */ 1,
-                           /* rb_upper_bound = */ kMaxNumel};
+                           /* rb_upper_bound = */ kMaxNumel,
+                           /* sp_is_dynamic = */ false,
+                           /* rb_is_dynamic = */ true};
     ScheduleConfig::TileConfig tile_config{
         /* warp_num = */ 8,
         /* tree_reduce_num = */ 256,
@@ -201,7 +242,9 @@ BuildStaticSpatialConfig(
     BucketInfo bucket_info_1_256{/* sp_lower_bound = */ 1,
                                  /* sp_upper_bound = */ kMaxNumel,
                                  /* rb_lower_bound = */ 1,
-                                 /* rb_upper_bound = */ 256};
+                                 /* rb_upper_bound = */ 256,
+                                 /* sp_is_dynamic = */ false,
+                                 /* rb_is_dynamic = */ true};
     ScheduleConfig::TileConfig tile_config_1_256{
         /* warp_num = */ 8,
         /* tree_reduce_num = */ 32,
@@ -211,7 +254,9 @@ BuildStaticSpatialConfig(
     BucketInfo bucket_info_257_2048{/* sp_lower_bound = */ 1,
                                     /* sp_upper_bound = */ kMaxNumel,
                                     /* rb_lower_bound = */ 257,
-                                    /* rb_upper_bound = */ 2048};
+                                    /* rb_upper_bound = */ 2048,
+                                    /* sp_is_dynamic = */ false,
+                                    /* rb_is_dynamic = */ true};
     ScheduleConfig::TileConfig tile_config_257_2048{
         /* warp_num = */ 8,
         /* tree_reduce_num = */ 128,
@@ -221,7 +266,9 @@ BuildStaticSpatialConfig(
     BucketInfo bucket_info_2049_INF{/* sp_lower_bound = */ 1,
                                     /* sp_upper_bound = */ kMaxNumel,
                                     /* rb_lower_bound = */ 2049,
-                                    /* rb_upper_bound = */ kMaxNumel};
+                                    /* rb_upper_bound = */ kMaxNumel,
+                                    /* sp_is_dynamic = */ false,
+                                    /* rb_is_dynamic = */ true};
     ScheduleConfig::TileConfig tile_config_2049_INF{
         /* warp_num = */ 8,
         /* tree_reduce_num = */ 256,
@@ -242,7 +289,9 @@ BuildStaticReduceConfig(
     BucketInfo bucket_info__1_1023{/* sp_lower_bound = */ 1,
                                    /* sp_upper_bound = */ 1023,
                                    /* rb_lower_bound = */ 1,
-                                   /* rb_upper_bound = */ 1};
+                                   /* rb_upper_bound = */ 1,
+                                   /* sp_is_dynamic = */ true,
+                                   /* rb_is_dynamic = */ false};
     ScheduleConfig::TileConfig tile_config__1_1023{
         /* warp_num = */ -1,
         /* tree_reduce_num = */ 1,
@@ -251,7 +300,9 @@ BuildStaticReduceConfig(
     BucketInfo bucket_info__1024_1M{/* sp_lower_bound = */ 1024,
                                     /* sp_upper_bound = */ 1024 * 1024 - 1,
                                     /* rb_lower_bound = */ 1,
-                                    /* rb_upper_bound = */ 1};
+                                    /* rb_upper_bound = */ 1,
+                                    /* sp_is_dynamic = */ true,
+                                    /* rb_is_dynamic = */ false};
     ScheduleConfig::TileConfig tile_config__1024_1M{
         /* warp_num = */ 32,
         /* tree_reduce_num = */ 1,
@@ -260,7 +311,9 @@ BuildStaticReduceConfig(
     BucketInfo bucket_info__1M_INF{/* sp_lower_bound = */ 1024 * 1024,
                                    /* sp_upper_bound = */ kMaxNumel,
                                    /* rb_lower_bound = */ 1,
-                                   /* rb_upper_bound = */ 1};
+                                   /* rb_upper_bound = */ 1,
+                                   /* sp_is_dynamic = */ true,
+                                   /* rb_is_dynamic = */ false};
     ScheduleConfig::TileConfig tile_config__1M_INF{
         /* warp_num = */ 32,
         /* tree_reduce_num = */ 1,
@@ -273,7 +326,9 @@ BuildStaticReduceConfig(
     BucketInfo bucket_info{/* sp_lower_bound = */ 1,
                            /* sp_upper_bound = */ kMaxNumel,
                            /* rb_lower_bound = */ 2,
-                           /* rb_upper_bound = */ 256};
+                           /* rb_upper_bound = */ 256,
+                           /* sp_is_dynamic = */ true,
+                           /* rb_is_dynamic = */ false};
     ScheduleConfig::TileConfig tile_config{
         /* warp_num = */ 8,
         /* tree_reduce_num = */ 32,
@@ -290,7 +345,9 @@ BuildStaticReduceConfig(
     BucketInfo bucket_info{/* sp_lower_bound = */ 1,
                            /* sp_upper_bound = */ kMaxNumel,
                            /* rb_lower_bound = */ 257,
-                           /* rb_upper_bound = */ 2048};
+                           /* rb_upper_bound = */ 2048,
+                           /* sp_is_dynamic = */ true,
+                           /* rb_is_dynamic = */ false};
     ScheduleConfig::TileConfig tile_config{
         /* warp_num = */ warp_num,
         /* tree_reduce_num = */ tree_reduce_num,
@@ -304,7 +361,9 @@ BuildStaticReduceConfig(
     BucketInfo bucket_info{/* sp_lower_bound = */ 1,
                            /* sp_upper_bound = */ kMaxNumel,
                            /* rb_lower_bound = */ 2049,
-                           /* rb_upper_bound = */ kMaxNumel};
+                           /* rb_upper_bound = */ kMaxNumel,
+                           /* sp_is_dynamic = */ true,
+                           /* rb_is_dynamic = */ false};
     ScheduleConfig::TileConfig tile_config{
         /* warp_num = */ warp_num,
         /* tree_reduce_num = */ tree_reduce_num,
@@ -324,7 +383,9 @@ BuildDynamicShapeConfig(
   BucketInfo bucket_info{/* sp_lower_bound = */ 1,
                          /* sp_upper_bound = */ kMaxNumel,
                          /* rb_lower_bound = */ 1,
-                         /* rb_upper_bound = */ kMaxNumel};
+                         /* rb_upper_bound = */ kMaxNumel,
+                         /* sp_is_dynamic = */ true,
+                         /* rb_is_dynamic = */ true};
   ScheduleConfig::TileConfig tile_config{
       /* warp_num = */ warp_num,
       /* tree_reduce_num = */ tree_reduce_num,
diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.h b/paddle/cinn/ir/group_schedule/config/group_tile_config.h
index a62d9dd84fb59..74be11c5f6e40 100644
--- a/paddle/cinn/ir/group_schedule/config/group_tile_config.h
+++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.h
@@ -42,9 +42,6 @@ struct ScheduleConfig {
     std::set<std::string> temp_var_names;
     std::set<std::string> shared_var_names;
     std::set<std::string> direct_output_var_names;
-
-    std::unordered_map<std::string, BroadcastInfo> broadcast_info;
-    std::unordered_map<std::string, BroadcastInfo> broadcast_to_elementwise;
   };
 
   struct TileConfig {
@@ -59,27 +56,70 @@ struct ScheduleConfig {
 };
 
 struct BucketInfo {
-  int64_t sp_lower_bound = 1;
-  int64_t sp_upper_bound = INT64_MAX;
-  int64_t rb_lower_bound = 1;
-  int64_t rb_upper_bound = INT64_MAX;
-
-  bool operator==(const BucketInfo& other) const {
-    return this->sp_lower_bound == other.sp_lower_bound &&
-           this->sp_upper_bound == other.sp_upper_bound &&
-           this->rb_lower_bound == other.rb_lower_bound &&
-           this->rb_upper_bound == other.rb_upper_bound;
-  }
+  struct Dimension {
+    int lower_bound;
+    int upper_bound;
+    std::string iter_type;
+    bool is_dynamic;
+    std::vector<double> weights;
+    Dimension()
+        : lower_bound(0),
+          upper_bound(INT_MAX),
+          iter_type("S"),
+          is_dynamic(false) {}
+    Dimension(int low, int upper, std::string iter_type, bool is_dynamic)
+        : lower_bound(low),
+          upper_bound(upper),
+          iter_type(iter_type),
+          is_dynamic(is_dynamic) {}
+    Dimension(int low,
+              int upper,
+              std::string iter_type,
+              bool is_dynamic,
+              std::vector<double> weights)
+        : lower_bound(low),
+          upper_bound(upper),
+          iter_type(iter_type),
+          is_dynamic(is_dynamic),
+          weights(weights) {}
+  };
+  std::vector<Dimension> space;
+
+  std::string ToString() const;
+  BucketInfo() = default;
+  BucketInfo(int sp_lower_bound,
+             int sp_upper_bound,
+             int rb_lower_bound,
+             int rb_upper_bound,
+             bool sp_is_dynamic,
+             bool rb_is_dynamic);
+  explicit BucketInfo(size_t size) : space(std::vector<Dimension>(size)) {}
+  bool operator==(const BucketInfo& other) const;
 };
 
 struct BucketInfoHash {
   std::size_t operator()(const BucketInfo& bucket_info) const noexcept {
-    std::size_t hash_spl = std::hash<uint64_t>{}(bucket_info.sp_lower_bound);
-    std::size_t hash_spu = std::hash<uint64_t>{}(bucket_info.sp_upper_bound);
-    std::size_t hash_rbl = std::hash<uint64_t>{}(bucket_info.rb_lower_bound);
-    std::size_t hash_rbu = std::hash<uint64_t>{}(bucket_info.rb_upper_bound);
-    return adt::hash_combine(adt::hash_combine(hash_spl, hash_spu),
-                             adt::hash_combine(hash_rbl, hash_rbu));
+    PADDLE_ENFORCE_GT(
+        bucket_info.space.size(),
+        0,
+        ::common::errors::InvalidArgument(
+            "Bucketinfo 's dimension number should be more than 0"));
+
+    std::size_t hash_past_dims = adt::hash_combine(
+        std::hash<uint64_t>{}(bucket_info.space[0].lower_bound),
+        std::hash<uint64_t>{}(bucket_info.space[0].upper_bound));
+    int dims = bucket_info.space.size();
+    if (dims == 1) {
+      return hash_past_dims;
+    } else {
+      for (int i = 1; i < dims; i++) {
+        std::size_t hash_temp_dim = adt::hash_combine(
+            std::hash<uint64_t>{}(bucket_info.space[i].lower_bound),
+            std::hash<uint64_t>{}(bucket_info.space[i].upper_bound));
+        hash_past_dims = adt::hash_combine(hash_past_dims, hash_temp_dim);
+      }
+      return hash_past_dims;
+    }
   }
 };
 
diff --git a/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto b/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto
index f8e0aeadcfa09..9396092a422fa 100644
--- a/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto
+++ b/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto
@@ -1,11 +1,11 @@
 // Copyright (c) 2022 CINN Authors. All Rights Reserved.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -36,7 +36,7 @@ message TileConfig{
 message TileData{
     int32 priority=1;
     BucketInfo bucket_info =2;
-    TileConfig tile_config =3; 
+    TileConfig tile_config =3;
 }
 
 message TileDatabase{
diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc
index 52a08c7a22900..c42ced360d86e 100644
--- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc
+++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc
@@ -66,32 +66,42 @@ void DynamicShapeGroupScheduler::InitBuckets() {
             << iter_space_info.total_sp_extent;
     VLOG(4) << "iter_space_info.total_rb_extent: "
             << iter_space_info.total_rb_extent;
-    VLOG(4) << "bucket_info.sp_lower_bound: " << bucket_info.sp_lower_bound;
-    VLOG(4) << "bucket_info.sp_upper_bound: " << bucket_info.sp_upper_bound;
-    VLOG(4) << "bucket_info.rb_lower_bound: " << bucket_info.rb_lower_bound;
-    VLOG(4) << "bucket_info.rb_upper_bound: " << bucket_info.rb_upper_bound;
-    if (OutOfRange(iter_space_info.total_sp_extent,
-                   bucket_info.sp_lower_bound,
-                   bucket_info.sp_upper_bound) ||
-        OutOfRange(iter_space_info.total_rb_extent,
-                   bucket_info.rb_lower_bound,
-                   bucket_info.rb_upper_bound)) {
-      VLOG(4) << "Out of range";
-      return;
+    VLOG(4) << "bucket_info is: ";
+    int dims = bucket_info.space.size();
+    SymbolicPredicate predicate = ir::Expr(true);
+    for (int i = 0; i < dims; ++i) {
+      VLOG(4) << "bucket_info.space[" << i
+              << "].lower_bound= " << bucket_info.space[i].lower_bound;
+      VLOG(4) << "bucket_info.space[" << i
+              << "].upper_bound= " << bucket_info.space[i].upper_bound;
+      if (dims == 2 && bucket_info.space[1].iter_type == "R") {
+        if (i == 0 && OutOfRange(iter_space_info.total_sp_extent,
+                                 bucket_info.space[i].lower_bound,
+                                 bucket_info.space[i].upper_bound)) {
+          VLOG(4) << "Dimension " << i << " Out of range";
+          return;
+        }
+        if (i == 1 && OutOfRange(iter_space_info.total_rb_extent,
+                                 bucket_info.space[i].lower_bound,
+                                 bucket_info.space[i].upper_bound)) {
+          VLOG(4) << "Dimension " << i << " Out of range";
+          return;
+        }
+        auto extent = (i == 0) ? iter_space_info.total_sp_extent
+                               : iter_space_info.total_rb_extent;
+        SymbolicPredicate lower_bound_predicate =
+            ir::GE::Make(extent, ir::Expr(bucket_info.space[i].lower_bound));
+        SymbolicPredicate upper_bound_predicate =
+            ir::LE::Make(extent, ir::Expr(bucket_info.space[i].upper_bound));
+        SymbolicPredicate curr_predicate =
+            ir::And::Make(lower_bound_predicate, upper_bound_predicate);
+        predicate = ir::And::Make(predicate, curr_predicate);
+      } else {
+        PADDLE_THROW(::common::errors::Unimplemented(
+            "Now, the function InitBucket doesn't support the cases except "
+            "SR"));
+      }
     }
-    SymbolicPredicate sp_lower_bound_predicate = ir::GE::Make(
-        iter_space_info.total_sp_extent, ir::Expr(bucket_info.sp_lower_bound));
-    SymbolicPredicate sp_upper_bound_predicate = ir::LE::Make(
-        iter_space_info.total_sp_extent, ir::Expr(bucket_info.sp_upper_bound));
-    SymbolicPredicate rb_lower_bound_predicate = ir::GE::Make(
-        iter_space_info.total_rb_extent, ir::Expr(bucket_info.rb_lower_bound));
-    SymbolicPredicate rb_upper_bound_predicate = ir::LE::Make(
-        iter_space_info.total_rb_extent, ir::Expr(bucket_info.rb_upper_bound));
-    SymbolicPredicate sp_predicate =
-        ir::And::Make(sp_lower_bound_predicate, sp_upper_bound_predicate);
-    SymbolicPredicate rb_predicate =
-        ir::And::Make(rb_lower_bound_predicate, rb_upper_bound_predicate);
-    SymbolicPredicate predicate = ir::And::Make(sp_predicate, rb_predicate);
     ScheduleContext schedule_context{output_names,
                                      target_,
                                      std::move(iter_space_info),
@@ -154,6 +164,18 @@ DynamicShapeGroupScheduler::GetIRs() {
   return irs;
 }
 
+// Returns a one-element list of (predicate, expr) pairs. The predicate is the
+// constant comparison `1 == 1`; the expr is the first expression of the
+// module held by ir_sch_.
+std::vector<std::pair<SymbolicPredicate, ir::Expr>>
+DynamicShapeGroupScheduler::GetCX86IRs() {
+  std::vector<std::pair<SymbolicPredicate, ir::Expr>> irs(1);
+  irs[0].first = ir::EQ::Make(ir::Expr(1), ir::Expr(1));
+  // Both halves belong to the only element; index 1 would be out of bounds.
+  irs[0].second = ir_sch_->GetModule().GetExprs()[0];
+  return irs;
+}
+
 IterativeSpaceInfo DynamicShapeGroupScheduler::ConstructIterSpaceInfo(
     ScheduleBlockNode* node) {
   VLOG(5) << "global master: " << node->id();
diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h
index 0e5205a419973..547d68b5a67a9 100644
--- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h
+++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h
@@ -37,6 +37,7 @@ class DynamicShapeGroupScheduler : public GroupScheduler {
   void Schedule() override;
 
   std::vector<std::pair<SymbolicPredicate, ir::Expr>> GetIRs() override;
+  std::vector<std::pair<SymbolicPredicate, ir::Expr>> GetCX86IRs() override;
 
   struct BucketContext {
     SymbolicPredicate predicate;
diff --git a/paddle/cinn/ir/group_schedule/search/config_searcher.cc b/paddle/cinn/ir/group_schedule/search/config_searcher.cc
index 5dffb8a78cd5a..3e620d616762f 100644
--- a/paddle/cinn/ir/group_schedule/search/config_searcher.cc
+++ b/paddle/cinn/ir/group_schedule/search/config_searcher.cc
@@ -25,18 +25,18 @@ namespace search {
 
 WeightedSamplingTrailObjectiveFunc::WeightedSamplingTrailObjectiveFunc(
     ::pir::Program* program,
-    const IterSpace& iter_space,
+    const BucketInfo& bucket_info,
     double sampling_prob,
     int max_sampling_times,
     int repeats)
     : program_(program),
-      iter_space_(iter_space),
+      bucket_info_(bucket_info),
       measurer_(program),
       sampling_prob_(sampling_prob),
       max_sampling_times_(max_sampling_times),
       repeats_(repeats) {
   double weighted_space_size = 1.0;
-  for (const auto& dim : iter_space_.space) {
+  for (const auto& dim : bucket_info_.space) {
     PADDLE_ENFORCE_EQ(dim.upper_bound - dim.lower_bound + 1,
                       dim.weights.size(),
                       ::common::errors::InvalidArgument(
@@ -54,7 +54,7 @@ WeightedSamplingTrailObjectiveFunc::WeightedSamplingTrailObjectiveFunc(
   // Generate Sampling Inputs
   const auto Sample = [&]() -> std::vector<int64_t> {
     std::vector<int64_t> samples;
-    for (IterSpace::Dimension dim : iter_space_.space) {
+    for (BucketInfo::Dimension dim : bucket_info_.space) {
       int sampled = utils::SampleDiscreteFromDistribution<double>(dim.weights,
                                                                   &rand_seed_);
       samples.push_back(static_cast<int64_t>(sampled) + dim.lower_bound);
@@ -82,19 +82,15 @@ ScoreType WeightedSamplingTrailObjectiveFunc::operator()(
   auto tile_config_database = std::make_shared<NaiveTileConfigDatabase>();
   IterSpaceType iter_space_type = [&] {
     std::vector<std::pair<std::string, std::string>> res;
-    for (const auto& dim : iter_space_.space) {
+    for (const auto& dim : bucket_info_.space) {
       res.emplace_back(dim.iter_type, (dim.is_dynamic ? "dynamic" : "static"));
     }
     return res;
   }();
-  BucketInfo bucket_info{iter_space_.space[0].lower_bound,
-                         iter_space_.space[0].upper_bound,
-                         iter_space_.space[1].lower_bound,
-                         iter_space_.space[1].upper_bound};
   ScheduleConfig::TileConfig config{
       candidate[0], candidate[1], candidate[2], NoneReduceMethod()};
   tile_config_database->AddConfig(
-      cinn::common::DefaultTarget(), iter_space_type, bucket_info, config);
+      cinn::common::DefaultTarget(), bucket_info_, config);
   auto& schedule_config_manager = ScheduleConfigManager::Instance();
   schedule_config_manager.AddConfigDatabase("custom", tile_config_database);
   measurer_.Compile();
diff --git a/paddle/cinn/ir/group_schedule/search/config_searcher.h b/paddle/cinn/ir/group_schedule/search/config_searcher.h
index 082417388e8a6..4b97547db6851 100644
--- a/paddle/cinn/ir/group_schedule/search/config_searcher.h
+++ b/paddle/cinn/ir/group_schedule/search/config_searcher.h
@@ -19,6 +19,7 @@
 #include <map>
 #include <vector>
 
+#include "paddle/cinn/ir/group_schedule/config/group_tile_config.h"
 #include "paddle/cinn/ir/group_schedule/search/measurer.h"
 #include "paddle/cinn/utils/random_engine.h"
 #include "paddle/pir/include/core/program.h"
@@ -39,7 +40,7 @@ class BaseObjectiveFunc {
 class WeightedSamplingTrailObjectiveFunc : public BaseObjectiveFunc {
  public:
   WeightedSamplingTrailObjectiveFunc(::pir::Program* program,
-                                     const IterSpace& iter_space,
+                                     const BucketInfo& bucket_info,
                                      double sampling_prob = 1.0,
                                      int max_sampling_times = 65536,
                                      int repeats = 10);
@@ -48,7 +49,7 @@ class WeightedSamplingTrailObjectiveFunc : public BaseObjectiveFunc {
 
  private:
   ::pir::Program* program_;
-  IterSpace iter_space_;
+  BucketInfo bucket_info_;
   Measurer measurer_;
   double sampling_prob_;
   int max_sampling_times_;
diff --git a/paddle/cinn/ir/group_schedule/search/measurer.cc b/paddle/cinn/ir/group_schedule/search/measurer.cc
index 1934ebea16b36..ea2fa18dcadbb 100644
--- a/paddle/cinn/ir/group_schedule/search/measurer.cc
+++ b/paddle/cinn/ir/group_schedule/search/measurer.cc
@@ -35,17 +35,6 @@ namespace cinn {
 namespace ir {
 namespace search {
 
-std::string IterSpace::ToString() const {
-  std::stringstream ss;
-  ss << "IterSpace: [";
-  for (const auto& dim : space) {
-    ss << dim.iter_type << "(" << dim.lower_bound << " - " << dim.upper_bound
-       << "), ";
-  }
-  ss << "]";
-  return ss.str();
-}
-
 std::shared_ptr<pir::PassManager> CreatePassManager() {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
diff --git a/paddle/cinn/ir/group_schedule/search/measurer.h b/paddle/cinn/ir/group_schedule/search/measurer.h
index 76de4b6eb065b..4118c40558b55 100644
--- a/paddle/cinn/ir/group_schedule/search/measurer.h
+++ b/paddle/cinn/ir/group_schedule/search/measurer.h
@@ -30,19 +30,6 @@ namespace cinn {
 namespace ir {
 namespace search {
 
-struct IterSpace {
-  struct Dimension {
-    int lower_bound;
-    int upper_bound;
-    std::string iter_type;
-    bool is_dynamic;
-    std::vector<double> weights;
-  };
-  std::vector<Dimension> space;
-
-  std::string ToString() const;
-};
-
 struct MeasureResult {
   ::common::TimeDuration compile_time;
   ::common::TimeDuration avg_kernel_execute_time;
diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
index a807699f330d2..c1860723cf0b1 100644
--- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
+++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
@@ -20,9 +20,6 @@
 #include "paddle/cinn/ir/ir_analyzer/ir_analyzer.h"
 #include "paddle/cinn/ir/schedule/ir_schedule_util.h"
 
-PD_DECLARE_bool(support_reduce_stride_read);
-PD_DECLARE_bool(support_trivial_stride_read);
-
 namespace cinn {
 namespace ir {
 
@@ -47,11 +44,26 @@ bool IsWarpReduce(const ScheduleConfig& config) {
   return std::visit(MatchWarpReduce, config.tile_config.reduce_method);
 }
 
+bool UseReduceTile(const ScheduleConfig& config) {
+  const auto& raw_reduce_axis = config.base_info->raw_reduce_axis;
+  const auto raw_data_rank = config.base_info->raw_data_rank;
+  if (raw_reduce_axis.empty()) {
+    return false;
+  }
+  for (size_t i = 1; i < raw_reduce_axis.size(); i++) {
+    if (raw_reduce_axis[i] != raw_reduce_axis[i - 1] + 1) {
+      return false;
+    }
+  }
+  return raw_reduce_axis.back() + 1 == raw_data_rank;
+}
+
 class TileFirstGeneralTactic final : public ScheduleTactic {
  public:
   void Init(ScheduleContext* context) override;
 
   void Apply(ir::IRSchedule* sch, const std::string& block_id) override;
+  void ApplyReduceTile(ir::IRSchedule* sch, const std::string& block_id);
 
   std::string TacticName() const override { return "TileFirstGeneralTactic"; }
 
@@ -98,6 +110,11 @@ void TileFirstGeneralTactic::Init(ScheduleContext* context) {
 
 void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch,
                                    const std::string& block_id) {
+  if (UseReduceTile(context_->config)) {
+    VLOG(4) << "Using ApplyReduceTile";
+    ApplyReduceTile(sch, block_id);
+    return;
+  }
   if (ir::IsReduceInitTensorName(block_id)) return;
   MergeReduceAxis(sch, block_id);
   VLOG(6) << "After MergeReduceAxis on block: [" << block_id
@@ -136,6 +153,106 @@ void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch,
   SetReduceType(sch, block_id);
 }
 
+void TileFirstGeneralTactic::ApplyReduceTile(ir::IRSchedule* sch,
+                                             const std::string& block_id) {
+  if (ir::IsReduceInitTensorName(block_id)) return;
+
+  const auto sp_thread = context_->config.tile_config.warp_num * 32 /
+                         context_->config.tile_config.tree_reduce_num;
+  const auto sp_loop = context_->config.tile_config.spatial_inner_num;
+  const auto rd_thread = context_->config.tile_config.tree_reduce_num;
+  VLOG(4) << "ApplyReduceTile sp_thread=" << sp_thread;
+  VLOG(4) << "ApplyReduceTile sp_loop=" << sp_loop;
+  VLOG(4) << "ApplyReduceTile rd_thread=" << rd_thread;
+  VLOG(4) << "ApplyReduceTile vec_flatten_axis: "
+          << utils::Join(vec_flatten_axis_, ", ");
+  VLOG(4) << "ApplyReduceTile vec_reduce_axis: "
+          << utils::Join(vec_reduce_axis_, ", ");
+
+  // Merge reduce axes
+  MergeReduceAxis(sch, block_id);
+  VLOG(4) << "After MergeReduceAxis on block: [" << block_id
+          << "], loop nest:\n"
+          << sch->GetModule().GetExprs().front();
+
+  // Merge spatial axes
+  MergeFlattenAxis(sch, block_id);
+  VLOG(4) << "After MergeFlattenAxis on block: [" << block_id
+          << "], loop nest:\n"
+          << sch->GetModule().GetExprs().front();
+
+  // Split spatial axes -> [sp_block, sp_loop, sp_thread]
+  int current_reduce_axis = 0;
+  if (vec_flatten_axis_.size() > 0) {
+    auto loops = sch->GetLoops(block_id);
+    if (sp_loop > 1 && sp_thread > 1) {
+      sch->Split(loops[0], {-1, sp_loop, sp_thread});
+      current_reduce_axis = 3;
+    } else if (sp_loop > 1 || sp_thread > 1) {
+      sch->Split(loops[0], {-1, sp_loop > 1 ? sp_loop : sp_thread});
+      current_reduce_axis = 2;
+    } else {
+      current_reduce_axis = 1;
+    }
+  }
+  VLOG(4) << "After SplitSptial on block: [" << block_id << "], loop nest:\n"
+          << sch->GetModule().GetExprs().front();
+
+  // Split reduce axes -> [rd_loop, rd_thread]
+  if (vec_reduce_axis_.size() > 0) {
+    auto loops = sch->GetLoops(block_id);
+    auto reduce_loop = loops[current_reduce_axis].As<ir::For>();
+    sch->Split(loops[current_reduce_axis], {-1, rd_thread});
+    VLOG(4) << "Before ReorderReduction on block: [" << block_id
+            << "], loop nest:\n"
+            << sch->GetModule().GetExprs().front();
+
+    // TODO(lshpku): the Reorder is unneeded if the later FactorizeReduction
+    // supports rf_axis=1.
+    loops = sch->GetLoops(block_id);
+    sch->Reorder({loops[current_reduce_axis + 1], loops[current_reduce_axis]});
+    VLOG(4) << "Before FactorizeReduction on block: [" << block_id
+            << "], loop nest:\n"
+            << sch->GetModule().GetExprs().front();
+
+    if (IsReduceBlock(context_->config, block_id)) {
+      loops = sch->GetLoops(block_id);
+      sch->FactorizeReduction(loops[current_reduce_axis],
+                              /* rf_axis = */ 0,
+                              /* with_write_back_block_init = */ false);
+    }
+  }
+  VLOG(4) << "After SplitReduce on block: [" << block_id << "], loop nest:\n"
+          << sch->GetModule().GetExprs().front();
+
+  // Bind CUDA info
+  const auto DoBind = [&](const std::vector<ir::Expr>& loops) {
+    std::string sp_axis_type = "threadIdx.y";
+    std::string rd_axis_type = "threadIdx.x";
+    sch->Bind(loops[0], "blockIdx.x");
+    if (!vec_flatten_axis_.empty() && sp_thread > 1) {
+      if (vec_reduce_axis_.empty()) {
+        sch->Bind(loops[current_reduce_axis - 1], rd_axis_type);
+      } else {
+        sch->Bind(loops[current_reduce_axis - 1], sp_axis_type);
+      }
+    }
+    if (!vec_reduce_axis_.empty() && current_reduce_axis > 0) {
+      sch->Bind(loops[current_reduce_axis], rd_axis_type);
+    }
+  };
+  DoBind(sch->GetLoops(block_id));
+  if (IsReduceBlock(context_->config, block_id) &&
+      sch->HasBlock(block_id + "_rf")) {
+    DoBind(sch->GetLoops(block_id + "_rf"));
+  }
+  VLOG(4) << "After BindCudaInfo on block: [" << block_id << "], loop nest:\n"
+          << sch->GetModule().GetExprs().front();
+
+  VariableTypeAssignment(sch, block_id);
+  SetReduceType(sch, block_id);
+}
+
 void TileFirstGeneralTactic::MergeFlattenAxis(ir::IRSchedule* sch,
                                               const std::string& block_id) {
   if (vec_flatten_axis_.size() >= 2) {
@@ -167,22 +284,13 @@ void TileFirstGeneralTactic::MergeReduceAxis(ir::IRSchedule* sch,
 void TileFirstGeneralTactic::SplitSptialInner(ir::IRSchedule* sch,
                                               const std::string& block_id) {
   if (IsInnerThreadSpatialLoopGT(context_->config, 1)) {
-    if (FLAGS_support_trivial_stride_read) {
-      auto loops = sch->GetLoops(block_id);
-      std::vector<int> split_factors{
-          static_cast<int>(context_->config.tile_config.spatial_inner_num), -1};
-      sch->Split(loops[0], split_factors);
-      loops = sch->GetLoops(block_id);
-      sch->Reorder({loops[1], loops[0]});
-    } else {
-      auto loops = sch->GetLoops(block_id);
-      auto split_loops = sch->Split(
-          loops[0],
-          std::vector<int>(
-              {-1,
-               static_cast<int>(
-                   context_->config.tile_config.spatial_inner_num)}));
-    }
+    auto loops = sch->GetLoops(block_id);
+    auto split_loops =
+        sch->Split(loops[0],
+                   std::vector<int>(
+                       {-1,
+                        static_cast<int>(
+                            context_->config.tile_config.spatial_inner_num)}));
   }
 }
 
@@ -193,30 +301,9 @@ void TileFirstGeneralTactic::SplitReduceInner(ir::IRSchedule* sch,
   auto loops = sch->GetLoops(block_id);
   auto reduce_loop = loops[reduce_current_axis_].As<ir::For>();
 
-  if (FLAGS_support_reduce_stride_read) {
-    if (context_->config.base_info->reduce_numel <= 256) {
-      std::vector<int> split_factors{
-          -1, static_cast<int>(context_->config.tile_config.tree_reduce_num)};
-      sch->Split(loops[reduce_current_axis_], split_factors);
-      loops = sch->GetLoops(block_id);
-      sch->Reorder(
-          {loops[reduce_current_axis_ + 1], loops[reduce_current_axis_]});
-    } else {
-      // split warp num first
-      std::vector<int> split_factors{
-          static_cast<int>(context_->config.tile_config.warp_num), -1, 32};
-      sch->Split(loops[reduce_current_axis_], split_factors);
-      loops = sch->GetLoops(block_id);
-      sch->Reorder(
-          {loops[reduce_current_axis_ + 2], loops[reduce_current_axis_ + 1]});
-      loops = sch->GetLoops(block_id);
-      sch->Fuse({loops[reduce_current_axis_], loops[reduce_current_axis_ + 1]});
-    }
-  } else {
-    std::vector<int> split_factors{
-        static_cast<int>(context_->config.tile_config.tree_reduce_num), -1};
-    sch->Split(loops[reduce_current_axis_], split_factors);
-  }
+  std::vector<int> split_factors{
+      static_cast<int>(context_->config.tile_config.tree_reduce_num), -1};
+  sch->Split(loops[reduce_current_axis_], split_factors);
   loops = sch->GetLoops(block_id);
   if (IsReduceBlock(context_->config, block_id)) {
     sch->FactorizeReduction(loops[reduce_current_axis_],
diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
index 0aaf620874568..adf979c7a7fd4 100644
--- a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
+++ b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc
@@ -52,7 +52,14 @@ void TileTactic::Init(ScheduleContext* context) {
       int64_t extent = static_cast<int64_t>(total_rb_extent.get_constant());
       nums_thread_per_block = GetFirstFactor(extent);
     } else {
-      nums_thread_per_block = context_->bucket_info.rb_lower_bound;
+      if (context->bucket_info.space.size() == 2 &&
+          context->bucket_info.space[1].iter_type == "R") {
+        nums_thread_per_block = context_->bucket_info.space[1].lower_bound;
+      } else {
+        PADDLE_THROW(::common::errors::Unimplemented(
+            "Now, the function GetTreeReduceSize doesn't support the cases "
+            "except SR"));
+      }
     }
     return nums_thread_per_block > max_num_threads ? max_num_threads
                                                    : nums_thread_per_block;
@@ -95,9 +102,17 @@ void TileTactic::Init(ScheduleContext* context) {
     // other bound to cuda thread.
     context_->iter_space_info.sp_space.emplace_back(
         ir::Expr(-1), IterativeSpaceInfo::AxisType::kCudaBlockX);
-    context_->iter_space_info.sp_space.emplace_back(
-        ir::Expr(GetNumThreadPerBlock(context_->bucket_info.rb_upper_bound)),
-        IterativeSpaceInfo::AxisType::kCudaThreadX);
+    if (context->bucket_info.space.size() == 2 &&
+        context->bucket_info.space[1].iter_type == "R") {
+      context_->iter_space_info.sp_space.emplace_back(
+          ir::Expr(
+              GetNumThreadPerBlock(context_->bucket_info.space[1].upper_bound)),
+          IterativeSpaceInfo::AxisType::kCudaThreadX);
+    } else {
+      PADDLE_THROW(::common::errors::Unimplemented(
+          "Now, the function GetTreeReduceSize doesn't support the cases "
+          "except SR"));
+    }
   }
   VLOG(6) << context_->iter_space_info.PrintIterSpace();
 }
diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc
index 1b9c83913112d..6d658ed30cc27 100644
--- a/paddle/cinn/ir/ir.cc
+++ b/paddle/cinn/ir/ir.cc
@@ -26,6 +26,7 @@
 #include "paddle/cinn/ir/module.h"
 #include "paddle/cinn/ir/tensor.h"
 #include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/common/errors.h"
 
 namespace cinn {
 namespace ir {
@@ -255,6 +256,7 @@ Expr For::Make(Var loop_var,
                Expr body,
                VectorizeInfo vector_info,
                BindInfo bind_info) {
+  ir::TryElevateInt32ToInt64({loop_var, min, extent});
   auto node = make_shared<For>();
   CHECK(loop_var.defined());
   CHECK(min.defined());
@@ -884,9 +886,21 @@ void For::Verify() const {
   CHECK(extent.defined());
   CHECK(body.defined());
 
-  CHECK_EQ(loop_var->type(), type_of<int32_t>());
-  CHECK_EQ(min->type(), type_of<int32_t>());
-  CHECK_EQ(extent->type(), type_of<int32_t>());
+  PADDLE_ENFORCE_EQ((loop_var->type() == type_of<int32_t>()) ||
+                        (loop_var->type() == type_of<int64_t>()),
+                    true,
+                    ::common::errors::InvalidArgument(
+                        "loop var's type must be int32 or int64"));
+  PADDLE_ENFORCE_EQ((min->type() == type_of<int32_t>()) ||
+                        (min->type() == type_of<int64_t>()),
+                    true,
+                    ::common::errors::InvalidArgument(
+                        "loop min's type must be int32 or int64"));
+  PADDLE_ENFORCE_EQ((extent->type() == type_of<int32_t>()) ||
+                        (extent->type() == type_of<int64_t>()),
+                    true,
+                    ::common::errors::InvalidArgument(
+                        "loop extent's type must be int32 or int64"));
 }
 
 void PolyFor::Verify() const {
diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h
index eeba03a0978ea..84e14cc839c15 100644
--- a/paddle/cinn/ir/ir_base.h
+++ b/paddle/cinn/ir/ir_base.h
@@ -402,6 +402,11 @@ struct UnaryOpNode : public ExprNode<T> {
     return v().type();
   }
 
+  // Swaps the single operand for new_op when it compares equal to old_op.
+  void replace(Expr old_op, Expr new_op) {
+    Expr& operand = v();
+    if (operand == old_op) operand = new_op;
+  }
   Expr& v() { return operands().front(); }
   const Expr& v() const { return operands().front(); }
 
diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc
index 24583a67374e7..e68a5396578b0 100644
--- a/paddle/cinn/ir/schedule/impl/base.cc
+++ b/paddle/cinn/ir/schedule/impl/base.cc
@@ -92,7 +92,7 @@ void DyScheduleImpl::MergeExprs() {
     }
   }
   for (auto& block : merged_block) {
-    VLOG(3) << "in merged_block, it has " << block;
+    VLOG(3) << "in merged_block, it has \n" << block;
   }
   auto merged_expr = ir::Block::Make(merged_block);
   exprs[0]
diff --git a/paddle/cinn/ir/schedule/schedule_desc.proto b/paddle/cinn/ir/schedule/schedule_desc.proto
index 829478cf22dd4..ed6d8bef92dbb 100644
--- a/paddle/cinn/ir/schedule/schedule_desc.proto
+++ b/paddle/cinn/ir/schedule/schedule_desc.proto
@@ -1,11 +1,11 @@
 // Copyright (c) 2022 CINN Authors. All Rights Reserved.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/cinn/ir/test/tensor_test.cc b/paddle/cinn/ir/test/tensor_test.cc
index 4bf64f309735e..143b5fcd4d18a 100644
--- a/paddle/cinn/ir/test/tensor_test.cc
+++ b/paddle/cinn/ir/test/tensor_test.cc
@@ -59,7 +59,7 @@ function func_C (_A, _B, _D)
   {
     serial for (j, 0, 20)
     {
-      D[i, j] = (1.00000000f + ((2.00000000f * A[i, j]) + (2.00000000f * B[i, j])))
+      D[i, j] = (((A[i, j] + B[i, j]) * 2.00000000f) + 1.00000000f)
     }
   }
 }
@@ -117,7 +117,7 @@ void fn(void* _args, int32_t num_args)
   for (int32_t i = 0; i < 10; i += 1) {
     for (int32_t j = 0; j < 10; j += 1) {
       for (int32_t k = 0; k < 100; k += 1) {
-        B[((1000 * i) + ((100 * j) + k))] = (2.00000000f * A_reshape[((1000 * i) + ((100 * j) + k))]);
+        B[((1000 * i) + ((100 * j) + k))] = (A_reshape[((1000 * i) + ((100 * j) + k))] * 2.00000000f);
       };
     };
   };
@@ -175,7 +175,7 @@ void fn(void* _args, int32_t num_args)
   for (int32_t i = 0; i < 10; i += 1) {
     for (int32_t j = 0; j < 10; j += 1) {
       for (int32_t k = 0; k < 100; k += 1) {
-        B[((1000 * i) + ((100 * j) + k))] = (2.00000000f * A_copied_reshape[((1000 * i) + ((100 * j) + k))]);
+        B[((1000 * i) + ((100 * j) + k))] = (A_copied_reshape[((1000 * i) + ((100 * j) + k))] * 2.00000000f);
       };
     };
   };
diff --git a/paddle/cinn/lang/lower_test.cc b/paddle/cinn/lang/lower_test.cc
index 25b0bb20f1956..abb9f96b6dd72 100644
--- a/paddle/cinn/lang/lower_test.cc
+++ b/paddle/cinn/lang/lower_test.cc
@@ -53,7 +53,7 @@ TEST(lower, basic) {
   {
     serial for (j, 0, 15)
     {
-      B[i, j] = (1.00000000f + A[i, j])
+      B[i, j] = (A[i, j] + 1.00000000f)
     }
   }
 }
diff --git a/paddle/cinn/optim/cache_read_write_replace_test.cc b/paddle/cinn/optim/cache_read_write_replace_test.cc
index 86206d8515287..2769b7913eb00 100755
--- a/paddle/cinn/optim/cache_read_write_replace_test.cc
+++ b/paddle/cinn/optim/cache_read_write_replace_test.cc
@@ -105,7 +105,7 @@ function fn (_A, _B, _C1_write_cache)
   {
     serial for (j, 0, 100)
     {
-      C1_write_cache[i, j] = (3.00000000f + A[i, j])
+      C1_write_cache[i, j] = (((A[i, j] + 1.00000000f) + 1.00000000f) + 1.00000000f)
     }
   }
   serial for (i, 0, 100)
diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc
index 85f8153bb65d4..362e6bff8a113 100644
--- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc
+++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc
@@ -357,10 +357,10 @@ void EliminateCommonFactorHelper(ir::Expr* expr) {
 }
 
 void EliminateCommonFactorOfLocalIndex(ir::Expr* expr) {
-  VLOG(2) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr;
+  VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr;
   EliminateCommonFactorHelper<Gcd>(expr);
   EliminateCommonFactorHelper<Offset>(expr);
-  VLOG(2) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr;
+  VLOG(4) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr;
 }
 
 }  // namespace optim
diff --git a/paddle/cinn/optim/ir_simplify_test.cc b/paddle/cinn/optim/ir_simplify_test.cc
index fd2b5be74d062..561b60119f078 100755
--- a/paddle/cinn/optim/ir_simplify_test.cc
+++ b/paddle/cinn/optim/ir_simplify_test.cc
@@ -41,7 +41,7 @@ TEST(IrSimplify, basic) {
     // get (((C[(i * 20)] + 0) + 100) + 24.5)
     Simplify(&B);
     LOG(INFO) << "simplified: " << B;
-    auto out = "(124.500000f + C[i, 0])";
+    auto out = "(((C[i, 0] + 0.00000000f) + 100.000000f) + 24.5000000f)";
     EXPECT_EQ(out, utils::GetStreamCnt(B));
   }
 
@@ -69,7 +69,7 @@ TEST(IrSimplify, basic) {
   {
     serial for (j, 0, 20)
     {
-      B[i, j] = (125.000000f + (X[i, j] + y[i, 0]))
+      B[i, j] = ((((((X[i, j] + (y[i, 0] * 1.00000000f)) + (0.00000000f * X[i, j])) + 25.0000000f) + 100.000000f) - 0.00000000f) + 0.00000000f)
     }
   }
 }
@@ -104,7 +104,7 @@ TEST(IrSimplify, basic) {
   {
     serial for (j, 0, 20)
     {
-      B[i, j] = ((y[i, 0] / 3.00000000f) + (125.000000f + X[(1000 * i), 0]))
+      B[i, j] = ((((((X[(1000 * i), 0] + (y[i, 0] / 3.00000000f)) + (0.00000000f * X[i, j])) + 25.0000000f) + 100.000000f) - 0.00000000f) + 0.00000000f)
     }
   }
 }
diff --git a/paddle/cinn/optim/optimize_test.cc b/paddle/cinn/optim/optimize_test.cc
index bd1515fd7924c..db667a61cd86f 100755
--- a/paddle/cinn/optim/optimize_test.cc
+++ b/paddle/cinn/optim/optimize_test.cc
@@ -41,11 +41,11 @@ TEST(Optimize, Unroll) {
   {
     serial for (j_outer, 0, 4)
     {
-      C[i, (5 * j_outer)] = (1.00000000f + A[i, (5 * j_outer)])
-      C[i, (1 + (5 * j_outer))] = (1.00000000f + A[i, (1 + (5 * j_outer))])
-      C[i, (2 + (5 * j_outer))] = (1.00000000f + A[i, (2 + (5 * j_outer))])
-      C[i, (3 + (5 * j_outer))] = (1.00000000f + A[i, (3 + (5 * j_outer))])
-      C[i, (4 + (5 * j_outer))] = (1.00000000f + A[i, (4 + (5 * j_outer))])
+      C[i, (5 * j_outer)] = (A[i, (5 * j_outer)] + 1.00000000f)
+      C[i, (1 + (5 * j_outer))] = (A[i, (1 + (5 * j_outer))] + 1.00000000f)
+      C[i, (2 + (5 * j_outer))] = (A[i, (2 + (5 * j_outer))] + 1.00000000f)
+      C[i, (3 + (5 * j_outer))] = (A[i, (3 + (5 * j_outer))] + 1.00000000f)
+      C[i, (4 + (5 * j_outer))] = (A[i, (4 + (5 * j_outer))] + 1.00000000f)
     }
   }
 }
diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc
index 2ec4e172b3fc7..7faba111c2521 100644
--- a/paddle/cinn/optim/resize_buffer.cc
+++ b/paddle/cinn/optim/resize_buffer.cc
@@ -249,6 +249,7 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> {
     ir::Store* store = expr->As<ir::Store>();
     ir::Tensor tensor = store->tensor.as_tensor_ref();
     ResizeTensor(&tensor);
+    ReplaceTensorIndices<ir::Store>(store);
     ir::IRMutator<>::Visit(op, expr);
   }
 
@@ -264,11 +265,8 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> {
       return;
     }
 
-    const std::string& buffer_name = load->tensor.as_tensor_ref()->buffer->name;
-    if (buffer_name_to_shape_.count(buffer_name) > 0) {
-      load->tensor.as_tensor_ref()->shape =
-          buffer_name_to_shape_.at(buffer_name);
-    }
+    ir::Tensor tensor = load->tensor.as_tensor_ref();
+    ResizeTensor(&tensor);
 
     // For the moment, align the load tensor indices with the tensor shape using
     // the trick method. A better way would be to modify the FlattenLoop
@@ -277,6 +275,7 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> {
     for (int i = 0; i < cnt; i++) {
       load->indices.erase(load->indices.begin());
     }
+    ReplaceTensorIndices<ir::Load>(load);
     ir::IRMutator<>::Visit(op, expr);
   }
 
@@ -304,6 +303,35 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> {
     }
   }
 
+  // Rewrites the indices of a GPULocal-buffer access to (0, ..., 0, var) when
+  // the expression returned by op->index() uses exactly one distinct variable.
+  template <typename T>
+  void ReplaceTensorIndices(T* op) {
+    ir::Tensor tensor = op->tensor.as_tensor_ref();
+    ir::Buffer buffer = tensor->buffer;
+    const bool is_gpu_local =
+        buffer.defined() && buffer->memory_type == ir::MemoryType::GPULocal;
+    if (!is_gpu_local) return;
+
+    VLOG(4) << "replacing index of tensor: " << tensor->name;
+    ir::Expr flat_index = op->index();
+    std::unordered_map<std::string, ir::Expr> vars_by_name;
+    ir::ir_utils::CollectIRNodes(flat_index, [&](const ir::Expr* node) {
+      if (const ir::_Var_* var = node->as_var()) {
+        vars_by_name[var->name] = var->Copy();
+      }
+      return false;
+    });
+    if (vars_by_name.size() != 1) return;
+
+    ir::Expr single_var = vars_by_name.begin()->second;
+    VLOG(4) << "found single var: " << single_var;
+    for (size_t dim = 0; dim + 1 < op->indices.size(); ++dim) {
+      op->indices[dim] = ir::Expr(0);
+    }
+    op->indices.back() = single_var;
+  }
+
  private:
   const std::unordered_map<std::string, std::vector<ir::Expr>>&
       buffer_name_to_shape_;
diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc
index 4e5d5f4c5ae8e..5d4629436d7e6 100644
--- a/paddle/cinn/optim/transform_gpu_forloop.cc
+++ b/paddle/cinn/optim/transform_gpu_forloop.cc
@@ -426,7 +426,7 @@ class ReplaceVarToZero : public ir::IRMutator<> {
 };
 
 void OptimizeExprGPU(Expr *expr) {
-  VLOG(2) << "Before Optimize Expr:\n" << *expr;
+  VLOG(4) << "Before Optimize Expr:\n" << *expr;
 
   // copy var nodes to prevent one modification leading to multiple changes
   RestructureVarNodes restructure_var_nodes;
@@ -458,7 +458,7 @@ void OptimizeExprGPU(Expr *expr) {
   ReplaceVarToZero replace_var_to_zero;
   replace_var_to_zero(expr);
 
-  VLOG(2) << "After Optimize Expr: \n" << *expr;
+  VLOG(4) << "After Optimize Expr: \n" << *expr;
 }
 
 }  // namespace optim
diff --git a/paddle/cinn/poly/schedule_test.cc b/paddle/cinn/poly/schedule_test.cc
index af63ee34f4334..23710c93d4256 100755
--- a/paddle/cinn/poly/schedule_test.cc
+++ b/paddle/cinn/poly/schedule_test.cc
@@ -48,7 +48,7 @@ TEST(CreateStages, compute_at) {
   {
     serial for (j, 0, 100)
     {
-      B[i, j] = (1.00000000f + A[i, j])
+      B[i, j] = (A[i, j] + 1.00000000f)
       serial for (k, 0, 100)
       {
         C[i, j, k] = (B[i, j] * B[j, k])
@@ -99,21 +99,21 @@ TEST(CreateStages, buffer_bind_to_multiple_tensors_schedule) {
   {
     serial for (j, 0, 100)
     {
-      B[i, j] = (1.00000000f + A[i, j])
+      B[i, j] = (A[i, j] + 1.00000000f)
     }
   }
   serial for (i, 0, 100)
   {
     serial for (j, 0, 100)
     {
-      C[i, j] = (1.00000000f + A[i, j])
+      C[i, j] = (A[i, j] + 1.00000000f)
     }
   }
   serial for (i, 0, 100)
   {
     serial for (j, 0, 100)
     {
-      D[i, j] = (1.00000000f + A[i, j])
+      D[i, j] = (A[i, j] + 1.00000000f)
     }
   }
 }
diff --git a/paddle/cinn/poly/stage_test.cc b/paddle/cinn/poly/stage_test.cc
index e8cbf9dd8ff87..2c01b9b9de617 100644
--- a/paddle/cinn/poly/stage_test.cc
+++ b/paddle/cinn/poly/stage_test.cc
@@ -207,7 +207,7 @@ function fn (_A, _A1, _B)
       }
       serial for (j, 0, 32)
       {
-        B[((16 * i_outer) + i_inner), j] = (A1[((16 * i_outer) + i_inner), j] + (A1[(1 + ((16 * i_outer) + i_inner)), j] + A1[(2 + ((16 * i_outer) + i_inner)), j]))
+        B[((16 * i_outer) + i_inner), j] = ((A1[((16 * i_outer) + i_inner), j] + A1[(1 + ((16 * i_outer) + i_inner)), j]) + A1[(2 + ((16 * i_outer) + i_inner)), j])
       }
     }
   }
@@ -431,7 +431,7 @@ function fn (_A, _C)
   {
     serial for (j, 0, 200)
     {
-      C[i, j] = (6.00000000f + (2.00000000f * A[i, j]))
+      C[i, j] = ((((A[i, j] + 1.00000000f) + 1.00000000f) + 1.00000000f) * 2.00000000f)
     }
   }
 }
@@ -475,21 +475,21 @@ function fn (_A, _C, _C1, _C2)
   {
     serial for (j, 0, 200)
     {
-      C2[i, j] = (6.00000000f + (2.00000000f * A[i, j]))
+      C2[i, j] = ((((A[i, j] + 1.00000000f) + 1.00000000f) + 1.00000000f) * 2.00000000f)
     }
   }
   serial for (i, 0, 100)
   {
     serial for (j, 0, 200)
     {
-      C1[i, j] = (4.00000000f + (2.00000000f * A[i, j]))
+      C1[i, j] = (((A[i, j] + 1.00000000f) + 1.00000000f) * 2.00000000f)
     }
   }
   serial for (i, 0, 100)
   {
     serial for (j, 0, 200)
     {
-      C[i, j] = (2.00000000f + (2.00000000f * A[i, j]))
+      C[i, j] = ((A[i, j] + 1.00000000f) * 2.00000000f)
     }
   }
 }
diff --git a/paddle/cinn/pybind/backends.cc b/paddle/cinn/pybind/backends.cc
index 4e589380223df..a0f51bc88aad8 100644
--- a/paddle/cinn/pybind/backends.cc
+++ b/paddle/cinn/pybind/backends.cc
@@ -61,7 +61,10 @@ void BindExecutionEngine(py::module *m) {
                &ExecutionEngine::Create)),
            py::arg("options") = ExecutionOptions())
       .def("lookup", lookup)
-      .def("link", &ExecutionEngine::Link);
+      .def("link",
+           &ExecutionEngine::Link,
+           py::arg("module"),
+           py::arg("add_module") = true);
 
   {
     auto lookup = [](Compiler &self, absl::string_view name) {
diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc
index a0250b0174c52..a76bb16706c52 100644
--- a/paddle/cinn/runtime/flags.cc
+++ b/paddle/cinn/runtime/flags.cc
@@ -88,14 +88,6 @@ PD_DEFINE_bool(group_schedule_tiling_first,
                BoolFromEnv("FLAGS_group_schedule_tiling_first", false),
                "Whether to enable new group scheduler tiling first strategy.");
 
-PD_DEFINE_bool(support_reduce_stride_read,
-               BoolFromEnv("FLAGS_support_reduce_stride_read", false),
-               "Whether to enable stride read in reduced dim.");
-
-PD_DEFINE_bool(support_trivial_stride_read,
-               BoolFromEnv("FLAGS_support_trivial_stride_read", false),
-               "Whether to enable stride read in trivial dim.");
-
 PD_DEFINE_bool(cinn_use_common_subexpression_elimination,
                BoolFromEnv("FLAGS_cinn_use_common_subexpression_elimination",
                            false),
diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc
index 9e4da57143980..1b888aeed0d66 100644
--- a/paddle/common/flags.cc
+++ b/paddle/common/flags.cc
@@ -1453,6 +1453,10 @@ PHI_DEFINE_EXPORTED_bool(logging_trunc_pir_py_code,
                          "whether truncate the logging files under directory "
                          "FLAGS_logging_pir_py_code_dir");
 
+PHI_DEFINE_EXPORTED_bool(logging_pir_py_code_dump_symbolic_dims,
+                         false,
+                         "whether dump symbolic dims into pir py code.");
+
 /**
  * Using PIR API in Python
  * Name: enable_pir_api
@@ -1612,6 +1616,11 @@ PHI_DEFINE_EXPORTED_bool(pir_apply_shape_optimization_pass,
                          "Whether to apply shape_optimization pass "
                          "to infer symbolic shape");
 
+PHI_DEFINE_EXPORTED_int64(
+    pir_broadcast_tree_limit,
+    32,
+    "Maximum number of broadcast nodes allowed in a tree");
+
 PHI_DEFINE_EXPORTED_string(
     nvidia_package_dir,  // NOLINT
     "",
diff --git a/paddle/common/flags_native.cc b/paddle/common/flags_native.cc
index 5801b32667d6f..12af71499dec2 100644
--- a/paddle/common/flags_native.cc
+++ b/paddle/common/flags_native.cc
@@ -25,8 +25,7 @@
 #include <string>
 #include <vector>
 
-namespace paddle {
-namespace flags {
+namespace paddle::flags {
 
 std::stringstream& ErrorStream() {
   static std::stringstream err_ss;
@@ -554,5 +553,4 @@ INSTANTIATE_GET_FROM_ENV(std::string);
 
 #undef INSTANTIATE_GET_FROM_ENV
 
-}  // namespace flags
-}  // namespace paddle
+}  // namespace paddle::flags
diff --git a/paddle/fluid/distributed/collective/common.cc b/paddle/fluid/distributed/collective/common.cc
index e60ecf9b8dcb5..159e9bd2dfdfb 100644
--- a/paddle/fluid/distributed/collective/common.cc
+++ b/paddle/fluid/distributed/collective/common.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/distributed/collective/common.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors) {
   std::vector<Place> places;
@@ -65,5 +64,4 @@ bool CheckTensorsInXPUPlace(const std::vector<phi::DenseTensor>& tensors) {
       });
 }
 
-}  //  namespace distributed
-}  //  namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/collective/gloo_send_recv.cc b/paddle/fluid/distributed/collective/gloo_send_recv.cc
index 970cb6ec93dc2..c7d4b67d6dd7a 100644
--- a/paddle/fluid/distributed/collective/gloo_send_recv.cc
+++ b/paddle/fluid/distributed/collective/gloo_send_recv.cc
@@ -20,8 +20,7 @@
 #include "gloo/types.h"
 #include "paddle/fluid/distributed/collective/gloo_send_recv.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 void send_recv(SendRecvOptions* opts) {
   const auto& context = opts->context;
@@ -38,5 +37,4 @@ void send_recv(SendRecvOptions* opts) {
   }
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/collective/process_group.cc b/paddle/fluid/distributed/collective/process_group.cc
index f151c041c7412..4edbe8ca0e2f6 100644
--- a/paddle/fluid/distributed/collective/process_group.cc
+++ b/paddle/fluid/distributed/collective/process_group.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/distributed/collective/process_group.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 bool ProcessGroup::Task::IsCompleted() {
   std::lock_guard<std::mutex> lock(mutex_);
@@ -53,5 +52,4 @@ void ProcessGroupIdMap::DestroyProcessGroup() {
   id_map.clear();
 }
 
-}  //  namespace distributed
-}  //  namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/collective/process_group_gloo.cc b/paddle/fluid/distributed/collective/process_group_gloo.cc
index 283409329ea93..2b6724d75fe90 100644
--- a/paddle/fluid/distributed/collective/process_group_gloo.cc
+++ b/paddle/fluid/distributed/collective/process_group_gloo.cc
@@ -32,8 +32,7 @@
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 #ifdef _WIN32
 #define GENERATE_FUNC(type, func, ...)       \
@@ -727,5 +726,4 @@ phi::distributed::GlooCommContext* ProcessGroupGloo::GetCommContext() {
   return comm_context;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc
index f98cc9ac63bf0..7e6aaba62b92c 100644
--- a/paddle/fluid/distributed/collective/process_group_nccl.cc
+++ b/paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -42,8 +42,7 @@ COMMON_DECLARE_bool(enable_async_trace);
 constexpr bool FLAGS_enable_nccl_dynamic_check = false;
 constexpr int64_t kWaitBlockTImeout = 10;
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 using phi::distributed::CheckSizeOnEachRank;
 using phi::distributed::IsP2POP;
@@ -1045,5 +1044,4 @@ phi::distributed::NCCLCommContext* ProcessGroupNCCL::GetCommContext(
   return comm_context;
 }
 
-}  //  namespace distributed
-}  //  namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/common/afs_warpper.cc b/paddle/fluid/distributed/common/afs_warpper.cc
index af9dcd285500b..a733b8d985a14 100644
--- a/paddle/fluid/distributed/common/afs_warpper.cc
+++ b/paddle/fluid/distributed/common/afs_warpper.cc
@@ -16,8 +16,7 @@
 
 #include "paddle/fluid/framework/io/fs.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 // AfsClient impl
 int AfsClient::initialize(const FsClientParameter& fs_client_param) {
   // temporarily implemented with hdfs-client
@@ -96,5 +95,4 @@ std::vector<std::string> AfsClient::list(const std::string& path) {
 bool AfsClient::exist(const std::string& dir) {
   return paddle::framework::fs_exists(dir);
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
index a166ff0b6dfa2..68cf8cd13d255 100644
--- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc
@@ -17,8 +17,7 @@
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/operator.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id,
                                            TaskNode* node)
@@ -56,5 +55,4 @@ void AmplifierInterceptor::ReplyCompletedToUpStream() {
 
 REGISTER_INTERCEPTOR(Amplifier, AmplifierInterceptor);
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc
index c90d1503947bf..d65145f556053 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -23,8 +23,7 @@
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 void MessageBus::Init(
     int64_t rank,
@@ -250,5 +249,4 @@ bool MessageBus::SendInterRank(int64_t dst_rank,
 
 #endif
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc
index a5f90062dcfd9..d2d46f31d2765 100644
--- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc
+++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc
@@ -16,8 +16,7 @@
 
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 std::string RuntimeGraph::DebugString() const {
   std::ostringstream os;
@@ -29,5 +28,4 @@ std::string RuntimeGraph::DebugString() const {
   return os.str();
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/fleet_executor/task_loop.cc b/paddle/fluid/distributed/fleet_executor/task_loop.cc
index 270bce7786038..44e853a0d9684 100644
--- a/paddle/fluid/distributed/fleet_executor/task_loop.cc
+++ b/paddle/fluid/distributed/fleet_executor/task_loop.cc
@@ -17,8 +17,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 thread_local TaskLoop* TaskLoop::thread_local_loop_ = nullptr;
 
@@ -81,5 +80,4 @@ void TaskLoop::AbortNotInLoopThread() {
       std::this_thread::get_id()));
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc
index 848096eb4442f..3ec9c50c05d98 100644
--- a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc
+++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc
@@ -18,8 +18,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 TaskLoopThread::TaskLoopThread() : start_(false), loop_(nullptr) {}
 
@@ -56,5 +55,4 @@ void TaskLoopThread::Loop() {
   loop_ = nullptr;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc
index 0a4e704590f0b..b0e0c498f63be 100644
--- a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc
+++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc
@@ -19,8 +19,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 TaskLoopThreadPool::TaskLoopThreadPool() : TaskLoopThreadPool(1) {}
 
@@ -73,5 +72,4 @@ std::vector<TaskLoop*> TaskLoopThreadPool::GetAllLoops() {
   return loops_;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto
index 27a93a9787ff5..e7e708a2ee4f9 100644
--- a/paddle/fluid/distributed/ps.proto
+++ b/paddle/fluid/distributed/ps.proto
@@ -1,11 +1,11 @@
 // Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc
index b79dfaab3f200..22ce67c12a132 100644
--- a/paddle/fluid/distributed/ps/service/brpc_utils.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc
@@ -20,18 +20,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Variable;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
 namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 framework::proto::VarType::Type VarMessageToVarType(
     VariableMessage::Type type) {
@@ -343,5 +340,4 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) {
   return int_ip_port;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/service/coordinator_client.cc b/paddle/fluid/distributed/ps/service/coordinator_client.cc
index bf8233ec975fd..4f7381e6e8655 100644
--- a/paddle/fluid/distributed/ps/service/coordinator_client.cc
+++ b/paddle/fluid/distributed/ps/service/coordinator_client.cc
@@ -25,8 +25,7 @@
 static const int MIN_PORT = 8500;
 static const int MAX_PORT = 65535;
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 PD_DEFINE_uint64(total_fl_client_size, 100, "supported total fl client size");
 PD_DEFINE_uint32(coordinator_wait_all_clients_max_time, 60, "uint32: s");
@@ -201,5 +200,4 @@ void CoordinatorClient::SendFLStrategy(const uint32_t& client_id) {
   return;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
index 9133b406424e4..99eccec948397 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
@@ -24,8 +24,7 @@
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/platform/profiler.h"
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 #define CHECK_TABLE_EXIST(table, request, response)        \
   if (table == NULL) {                                     \
@@ -704,5 +703,4 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
   return 0;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc
index f98103fe28968..d864ab95724ca 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_client.cc
+++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc
@@ -15,8 +15,7 @@
 #include "paddle/fluid/distributed/ps/service/ps_local_client.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 int32_t PsLocalClient::Initialize() {
   const auto& downpour_param = _config.server_param().downpour_server_param();
   TableManager::Instance().Initialize();
@@ -329,5 +328,4 @@ ::std::future<int32_t> PsLocalClient::SetDayId(size_t table_id, int day_id) {
   return done();
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
index ff4035a39d30f..342e113288a06 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
@@ -21,8 +21,7 @@
 #include "paddle/fluid/distributed/ps/table/table.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 std::vector<std::string> GraphPyService::split(const std::string& str,
                                                const char pattern) {
   std::vector<std::string> res;
@@ -507,5 +506,4 @@ void GraphPyClient::StopServer() {
   if (status.get() == 0) stoped_ = true;
 }
 void GraphPyClient::FinalizeWorker() { this->worker_ptr->FinalizeWorker(); }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc
index b4402bea10ed4..124ae0d8b0837 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc
@@ -25,8 +25,7 @@
 
 using namespace std;  // NOLINT
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 ::paddle::distributed::PSParameter load_from_prototxt(
     const std::string& filename) {
@@ -134,5 +133,4 @@ int PSCore::StopServer() {
   return 0;
 }
 ::paddle::distributed::PSParameter* PSCore::GetParam() { return &_ps_param; }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc
index 10951ba5dc428..2937d691c980b 100755
--- a/paddle/fluid/distributed/ps/service/server.cc
+++ b/paddle/fluid/distributed/ps/service/server.cc
@@ -20,8 +20,7 @@
 #include "paddle/fluid/distributed/ps/service/ps_local_server.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer);
 REGISTER_PSCORE_CLASS(PSServer, PsLocalServer);
@@ -107,5 +106,4 @@ int32_t PSServer::Configure(
 
   return Initialize();
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/barrier_table.cc b/paddle/fluid/distributed/ps/table/barrier_table.cc
index f665a024a78af..b00a283c6b754 100644
--- a/paddle/fluid/distributed/ps/table/barrier_table.cc
+++ b/paddle/fluid/distributed/ps/table/barrier_table.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/distributed/ps/table/common_table.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 int32_t BarrierTable::Initialize() {
   auto trainers = _config.common().trainer_num();
@@ -74,5 +73,4 @@ int32_t BarrierTable::SetTableMap(
   return 0;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc
index 70954f0b7ad96..b5f185ed1f00e 100644
--- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc
@@ -18,8 +18,7 @@
 #include "paddle/common/flags.h"
 #include "paddle/utils/string/string_helper.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 int CtrCommonAccessor::Initialize() {
   auto name = _config.embed_sgd_param().name();
@@ -341,5 +340,4 @@ int CtrCommonAccessor::ParseFromString(const std::string& str, float* value) {
   return ret;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
index 038dcf950ab50..dbd778cdcf055 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/distributed/ps/table/graph/graph_edge.h"
 #include <cstring>
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
   id_arr.push_back(id);
@@ -27,5 +26,4 @@ void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
   weight_arr.push_back((half)weight);
 #endif
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc
index 31c098c49fba2..fa8fa61a23eab 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc
@@ -15,8 +15,7 @@
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 
 #include <cstring>
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 GraphNode::~GraphNode() {
   if (sampler != nullptr) {
@@ -122,5 +121,4 @@ void FeatureNode::recover_from_buffer(char* buffer) {
     feature.push_back(str);  // NOLINT
   }
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc
index 86871154ca23f..a27b82c812a55 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc
@@ -19,8 +19,7 @@
 #include <unordered_map>
 
 #include "paddle/phi/core/generator.h"
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; }
 
@@ -30,6 +29,7 @@ std::vector<int> RandomSampler::sample_k(
   if (k >= n) {
     k = n;
     std::vector<int> sample_result;
+    sample_result.reserve(k);
     for (int i = 0; i < k; i++) {
       sample_result.push_back(i);
     }
@@ -116,6 +116,7 @@ std::vector<int> WeightedSampler::sample_k(
   if (k >= count) {
     k = count;
     std::vector<int> sample_result;
+    sample_result.reserve(k);
     for (int i = 0; i < k; i++) {
       sample_result.push_back(i);
     }
@@ -164,5 +165,4 @@ int WeightedSampler::sample(
   subtract_count_map[this]++;
   return return_idx;
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/memory_dense_table.cc b/paddle/fluid/distributed/ps/table/memory_dense_table.cc
index 2e68bdce1931f..9f6abd17ef2bf 100644
--- a/paddle/fluid/distributed/ps/table/memory_dense_table.cc
+++ b/paddle/fluid/distributed/ps/table/memory_dense_table.cc
@@ -16,8 +16,7 @@
 
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 int FLAGS_pslib_table_save_max_retry_dense = 3;
 
@@ -416,5 +415,4 @@ int32_t MemoryDenseTable::Save(const std::string &path,
   return feasign_size;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc
index 4fd627bbf807c..4614978d41e51 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 int32_t MemorySparseGeoTable::Pull(TableContext& context) {
   CHECK(context.value_type == Sparse);
@@ -242,5 +241,4 @@ int32_t MemorySparseGeoTable::_PushSparse(const uint64_t* keys,
   return 0;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc
index 5689ccfe7a594..91e83015b6631 100644
--- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc
@@ -18,8 +18,7 @@
 #include "paddle/common/flags.h"
 #include "paddle/utils/string/string_helper.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 int SparseAccessor::Initialize() {
   auto name = _config.embed_sgd_param().name();
@@ -304,5 +303,4 @@ int SparseAccessor::ParseFromString(const std::string& str, float* value) {
   return ret;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
index d9b490a80bba6..fd6744df6edb7 100644
--- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
+++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc
@@ -20,8 +20,7 @@
 
 PD_DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient");
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 void SparseNaiveSGDRule::LoadConfig(const SparseCommonSGDRuleParameter &param,
                                     size_t emb_dim) {
@@ -395,5 +394,4 @@ void SparseAdaGradV2SGDRule::InitValueWork(float *value,
   sgd[G2SumIndex()] = 0;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/rpc/python_rpc_handler.cc b/paddle/fluid/distributed/rpc/python_rpc_handler.cc
index 13322114def64..1daf9ffc1dace 100644
--- a/paddle/fluid/distributed/rpc/python_rpc_handler.cc
+++ b/paddle/fluid/distributed/rpc/python_rpc_handler.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 constexpr auto kInternalModule = "paddle.distributed.rpc.internal";
 
 py::object getFunction(const py::object& module, const char* name) {
@@ -63,5 +62,4 @@ std::shared_ptr<PythonRpcHandler> PythonRpcHandler::GetInstance() {
   return python_rpc_handler_;
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/rpc/rpc.proto b/paddle/fluid/distributed/rpc/rpc.proto
index 2da9e37ae88d9..d9bd22aa974fc 100644
--- a/paddle/fluid/distributed/rpc/rpc.proto
+++ b/paddle/fluid/distributed/rpc/rpc.proto
@@ -1,11 +1,11 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
-// 
+//
 //     http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc
index 31f0f0844345c..bb2885480d72a 100644
--- a/paddle/fluid/distributed/test/barrier_table_test.cc
+++ b/paddle/fluid/distributed/test/barrier_table_test.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/table/table.h"
 #include "paddle/fluid/distributed/the_one_ps.pb.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 TEST(BarrierTable, Barrier) {
   int trainers = 2;
@@ -63,5 +62,4 @@ TEST(BarrierTable, Barrier) {
   ASSERT_EQ(ret, 0);
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc
index 3c98dd7667ade..ac691307ae1e2 100644
--- a/paddle/fluid/distributed/test/brpc_utils_test.cc
+++ b/paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -19,11 +19,9 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Variable;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc
index 36ba1426fe3d4..9aa4f9fb56050 100644
--- a/paddle/fluid/distributed/test/ctr_accessor_test.cc
+++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
 #include "paddle/fluid/distributed/the_one_ps.pb.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
@@ -315,5 +314,4 @@ TEST(downpour_feature_value_accessor_test, test_string_related) {
     ASSERT_FLOAT_EQ(value[i], 0);
   }
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc
index 429248a6eb4eb..48724b9336804 100644
--- a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc
+++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
 #include "paddle/fluid/distributed/the_one_ps.pb.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
@@ -171,5 +170,4 @@ TEST(downpour_feature_value_accessor_test, test_string_related) {
   ASSERT_NE(acc->ParseFromString(str, value), 0);
   // make sure init_zero=true
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc
index 6e848c3e2f4e4..1852293177641 100644
--- a/paddle/fluid/distributed/test/feature_value_test.cc
+++ b/paddle/fluid/distributed/test/feature_value_test.cc
@@ -18,8 +18,7 @@ limitations under the License. */
 
 #include "gtest/gtest.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 TEST(BENCHMARK, LargeScaleKV) {
   typedef SparseTableShard<uint64_t, FixedFeatureValue> shard_type;
@@ -46,5 +45,4 @@ TEST(BENCHMARK, LargeScaleKV) {
   ASSERT_FLOAT_EQ(value_data[3], 0.3);
 }
 
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc
index 8908891d9f14f..8e1161a4944b0 100644
--- a/paddle/fluid/distributed/test/table_test.cc
+++ b/paddle/fluid/distributed/test/table_test.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/distributed/the_one_ps.pb.h"
 // #include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
 
-namespace paddle {
-namespace distributed {
+namespace paddle::distributed {
 
 TEST(Table, Initialize) {
   TableParameter table_config;
@@ -29,5 +28,4 @@ TEST(Table, Initialize) {
   auto ret = table->Initialize(table_config, fs_config);
   ASSERT_EQ(ret, -1);
 }
-}  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle::distributed
diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index 5667a86876e19..041ea3ec3a286 100755
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -13,6 +13,10 @@ set(eager_deps
     grad_tensor_holder
     custom_operator_node)
 
+if(WITH_GPU OR WITH_ROCM)
+  set(eager_deps ${eager_deps} phi_kernel_gpu)
+endif()
+
 if(NOT (NOT WITH_PYTHON AND ON_INFER))
   set(eager_deps ${eager_deps} accumulation_node prim_utils)
 endif()
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index 853a0c445797c..247651ae149f5 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/tensor_wrapper.h"
 #include "paddle/fluid/framework/executor_cache.h"
+#include "paddle/fluid/framework/feed_hook.h"
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
 #include "paddle/fluid/framework/tensor_ref_array.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -583,6 +584,7 @@ inline void PirRunProgramAPI(
     //}
   }
 
+  paddle::framework::RunFeedHooks(*forward_program, *global_inner_scope);
   // interpretercore run
   if (!forward_program->block()->empty()) {
     paddle::platform::RecordEvent record_event(
@@ -869,7 +871,6 @@ inline void RunProgramGradAPI(
   auto *backward_global_block = PADDLE_GET_CONST(
       paddle::framework::BlockDesc *, attrs.at("backward_global_block"));
   auto *backward_program = backward_global_block->Program();
-
   details::Trans2ContiguousTensorsInplace(out_grad);
 
   auto out_grad_names = details::GetTensorsName(out_grad);
@@ -1155,6 +1156,7 @@ inline void PirRunProgramGradAPI(
     }
   }
 
+  paddle::framework::RunFeedHooks(*backward_program, *global_inner_scope);
   if (!backward_program->block()->empty()) {
     paddle::platform::RecordEvent record_event(
         "interpreter_core_run",
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 041339fe597c3..c8f3dc0d673f1 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -515,6 +515,12 @@ cc_library(
   feed_fetch_method
   SRCS feed_fetch_method.cc
   DEPS lod_tensor scope glog)
+
+cc_library(
+  feed_hook
+  SRCS feed_hook.cc
+  DEPS lod_tensor scope glog pir)
+
 cc_library(
   variable_helper
   SRCS variable_helper.cc
@@ -529,6 +535,7 @@ set(NAIVE_EXECUTOR_DEPS
     glog
     lod_rank_table
     feed_fetch_method
+    feed_hook
     graph_to_program_pass
     standalone_executor
     variable_helper)
@@ -598,6 +605,7 @@ if(WITH_DISTRIBUTE)
            lodtensor_printer
            lod_rank_table
            feed_fetch_method
+           feed_hook
            collective_helper
            ${GLOB_DISTRIBUTE_DEPS}
            graph_to_program_pass
@@ -628,7 +636,7 @@ if(WITH_DISTRIBUTE)
     #         pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
     #         device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
     #         index_sampler index_wrapper sampler index_dataset_proto
-    #         lod_rank_table framework_io fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
+    #         lod_rank_table framework_io fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method feed_hook
     #         graph_to_program_pass variable_helper timer monitor
     #         heter_service_proto fleet heter_server brpc fleet_executor
     #         graph_gpu_wrapper)
@@ -677,6 +685,7 @@ if(WITH_DISTRIBUTE)
            metrics
            lodtensor_printer
            feed_fetch_method
+           feed_hook
            graph_to_program_pass
            variable_helper
            timer
@@ -750,6 +759,7 @@ if(WITH_DISTRIBUTE)
            metrics
            lodtensor_printer
            feed_fetch_method
+           feed_hook
            graph_to_program_pass
            variable_helper
            timer
@@ -808,6 +818,7 @@ elseif(WITH_PSLIB)
          box_wrapper
          lodtensor_printer
          feed_fetch_method
+         feed_hook
          graph_to_program_pass
          variable_helper
          timer
@@ -854,6 +865,7 @@ else()
          box_wrapper
          lodtensor_printer
          feed_fetch_method
+         feed_hook
          graph_to_program_pass
          variable_helper
          timer
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index c01d845b6e145..595841d11170a 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -31,8 +31,7 @@ limitations under the License. */
 
 USE_INT_STAT(STAT_total_feasign_num_in_mem);
 COMMON_DECLARE_bool(enable_ins_parser_file);
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 DLManager& global_dlmanager_pool() {
   static DLManager manager;
@@ -3267,5 +3266,4 @@ void MiniBatchGpuPack::transfer_to_gpu() {
 }
 #endif
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 7dbd1c8484926..b4e1dcca1cf0b 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -24,8 +24,7 @@ using float16 = paddle::platform::float16;
 using bfloat16 = paddle::platform::bfloat16;
 using pstring = phi::dtype::pstring;
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 struct DataTypeMap {
   std::unordered_map<std::type_index, proto::VarType::Type> cpp_to_proto_;
@@ -163,5 +162,4 @@ proto::VarType::Type PromoteTypesIfComplexExists(
   return promote_types_table[type_an][type_bn];
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 4c78b12fd4ac4..5e4edb1ca2870 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -24,9 +24,7 @@
 COMMON_DECLARE_bool(sync_nccl_allreduce);
 #endif
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -335,6 +333,4 @@ void AllReduceOpHandle::SyncNCCLAllReduce() {
 #endif
 
 std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 7f4a37a18cbb1..896b70b8b9156 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 void BroadcastOpHandle::RunImpl() {
   platform::RecordEvent record_event(
@@ -266,6 +264,4 @@ void BroadcastOpHandle::InitOutputValue(
 }
 
 std::string BroadcastOpHandle::Name() const { return "broadcast"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 79578e5653a22..e0a03099a881d 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -28,9 +28,7 @@ COMMON_DECLARE_bool(use_mkldnn);
 PD_DECLARE_bool(use_cinn);
 #endif
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
   // Should fix the allreduce op order if scheduling
@@ -503,9 +501,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
   return graph;
 }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
 
 USE_PASS(sync_batch_norm_pass);
 USE_PASS(fuse_relu_depthwise_conv_pass);
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index fdc12b697ac02..19f7ef1114b6f 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -18,9 +18,7 @@
 
 COMMON_DECLARE_bool(allreduce_record_one_event);
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 struct VarHandleBase;
 
 ComputationOpHandle::ComputationOpHandle(ir::Node *node,
@@ -55,6 +53,4 @@ bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
 }
 
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 4dbff851f00e2..b8db1e321257b 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -21,15 +21,11 @@
 #endif
 #include <algorithm>
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Variable;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 EagerDeletionOpHandle::EagerDeletionOpHandle(
     ir::Node *node,
@@ -213,6 +209,4 @@ std::vector<std::string> EagerDeletionOpHandle::VarsToDelete() const {
   return var_names;
 }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index ab45fc7d061db..f947794ccdd05 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -26,9 +26,7 @@
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy,
@@ -390,6 +388,4 @@ bool FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) {
   }
 }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc
index ee78d36671107..e09b6ec2a5719 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 FetchAsyncOpHandle::FetchAsyncOpHandle(ir::Node *node,
                                        FetchResultType *data,
@@ -306,6 +304,4 @@ bool FetchAsyncOpHandle::IsMultiDeviceTransfer() { return true; }
 
 std::string FetchAsyncOpHandle::Name() const { return "FetchAsync"; }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 2ca24a6bbbb88..39a91d0e8e645 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -18,9 +18,7 @@
 
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 FetchOpHandle::FetchOpHandle(ir::Node *node,
                              FetchResultType *data,
@@ -182,6 +180,4 @@ bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
 
 std::string FetchOpHandle::Name() const { return "Fetch"; }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index 738a74d1d846f..c8117653d12bf 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -19,14 +19,12 @@
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
-#include "paddle/phi/backends/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_alignment.h"
 
 PD_DEFINE_bool(skip_fused_all_reduce_check, false, "");  // NOLINT
 COMMON_DECLARE_bool(allreduce_record_one_event);
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 typedef std::vector<
     std::vector<std::pair<std::string, const phi::DenseTensor *>>>
@@ -407,6 +405,4 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
 }
 
 std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
index c446d3502e4e4..01c833474bc1a 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -17,9 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 void FusedBroadcastOpHandle::RunImpl() {
   platform::RecordEvent record_event(
@@ -58,6 +56,4 @@ void FusedBroadcastOpHandle::RunImpl() {
 
 std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc
index 3e58662be1b82..49a8b3904374d 100644
--- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc
@@ -20,9 +20,7 @@
 COMMON_DECLARE_bool(sync_nccl_allreduce);
 #endif
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle(
@@ -136,6 +134,4 @@ std::string FusedGradMergeAllReduceOpHandle::Name() const {
   return "fused_grad_merge_all_reduce";
 }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index b917c161193fb..45660331c1202 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -20,9 +20,7 @@
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 static std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
     ir::Graph *graph, size_t place_num) {
@@ -332,6 +330,4 @@ FetchResultType ParallelSSAGraphExecutor::Run(
   }
 }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index fe43126ca8abe..05e1693eb650e 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -26,9 +26,7 @@ PADDLE_DEFINE_EXPORTED_bool(
     false,
     "Whether to make the result of computation deterministic in CPU side.");
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 std::once_flag CollectiveContext::init_flag_;
 std::unique_ptr<CollectiveContext> CollectiveContext::context_;
@@ -318,6 +316,4 @@ std::vector<const T *> ReduceOpHandle::GetInputValues(
 }
 
 std::string ReduceOpHandle::Name() const { return "reduce"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 8b486be9cc686..2cdfcf5687f93 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -23,9 +23,7 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node,
                                              size_t num_dev,
                                              Scope *scope,
@@ -126,6 +124,4 @@ void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) {
 }
 
 std::string ScaleLossGradOpHandle::Name() const { return "ScaleLossGrad"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc
index 14e109bb5381b..e3b3446209584 100644
--- a/paddle/fluid/framework/details/scope_buffered_monitor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc
@@ -17,17 +17,13 @@
 #include "paddle/common/flags.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Variable;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
 COMMON_DECLARE_double(local_exe_sub_scope_limit);
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 static constexpr double kMB = 1.0 / (1024.0 * 1024.0);
 
@@ -208,6 +204,4 @@ void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() {
   history_local_exec_scopes_.clear();
 }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index 02a68fb697efb..fe516be34c93d 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -18,17 +18,11 @@
 
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class MemOptVarInfo;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 
 class ComputationOpHandle;
 
@@ -103,6 +97,4 @@ void ShareTensorBufferOpHandle::InitCUDA() {
 
 void ShareTensorBufferOpHandle::RunImpl() { functor_(local_exec_scopes_[0]); }
 
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index 5c920fa3e318f..64180ec9a9b98 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 class DeviceWorker;
 
@@ -86,5 +85,4 @@ REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index bd4530f906fac..1f369b869b105 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -33,8 +33,7 @@ limitations under the License. */
 PD_DECLARE_bool(benchmark);
 COMMON_DECLARE_bool(use_mkldnn);
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 namespace {
 // block id starts from 0. This id is used to represent the codeblock
 // wrapping the first block 0.
@@ -609,5 +608,4 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
       << "'MKLDNN' is not supported, Please re-compile with WITH_ONEDNN option";
 #endif
 }
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc
index 9045ca0f6a17d..97bcf41845039 100644
--- a/paddle/fluid/framework/executor_cache.cc
+++ b/paddle/fluid/framework/executor_cache.cc
@@ -32,16 +32,11 @@ DECLARE_FILE_SYMBOLS(print_statistics);
 COMMON_DECLARE_bool(pir_apply_inplace_pass);
 COMMON_DECLARE_bool(print_ir);
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class ProgramDesc;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-
-namespace details {
+namespace paddle::framework::details {
 
 static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
   framework::ExecutionStrategy execution_strategy;
@@ -208,7 +203,8 @@ std::set<std::string> ParseSafeEagerDeletionSkipVarsSet(
   VLOG(1) << "Found skip_eager_delete_vars: " << skip_eager_delete_vars.size();
   return skip_eager_delete_vars;
 }
-}  // namespace details
+}  // namespace paddle::framework::details
+namespace paddle::framework {
 
 // C++11 removes the need for manual locking. Concurrent execution shall wait if
 // a static local variable is already being initialized.
@@ -588,5 +584,4 @@ std::unique_ptr<::pir::Program> ConstructBackwardIrProgram(
   return res;
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 46543e7cba9bd..3e7f8d77bf93d 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -25,8 +25,7 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 class Variable;
 
@@ -130,5 +129,4 @@ phi::DenseTensor& GetVariableTensor(const Scope& scope,
   return *var->GetMutable<phi::DenseTensor>();
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/feed_hook.cc b/paddle/fluid/framework/feed_hook.cc
new file mode 100644
index 0000000000000..180f51d7fcaf3
--- /dev/null
+++ b/paddle/fluid/framework/feed_hook.cc
@@ -0,0 +1,168 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/feed_hook.h"
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <mutex>
+#include <optional>
+#include <random>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include "paddle/common/flags.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/pir/include/core/program.h"
+
+COMMON_DECLARE_string(logging_pir_py_code_dir);
+COMMON_DECLARE_bool(logging_trunc_pir_py_code);
+
+namespace paddle::framework {
+
+namespace {
+
+// Returns the path of the Python file that example input tensor metadata is
+// appended to, or std::nullopt when FLAGS_logging_pir_py_code_dir is empty.
+std::optional<std::string> GetLoggingFilePath() {
+  if (FLAGS_logging_pir_py_code_dir.empty()) return std::nullopt;
+  const std::string file_path =
+      FLAGS_logging_pir_py_code_dir + "/programs_example_input_tensor_meta.py";
+  return file_path;
+}
+
+// Empties the logging file at most once per process; a no-op unless
+// FLAGS_logging_trunc_pir_py_code is set and the logging path is available.
+void TryTruncateLoggingFile() {
+  if (!FLAGS_logging_trunc_pir_py_code) return;
+  std::optional<std::string> file_path = GetLoggingFilePath();
+  if (!file_path.has_value()) return;
+  static std::once_flag once_flag;
+  std::call_once(once_flag, [&] {
+    // Opening with std::ios::trunc discards any existing content.
+    std::ofstream ofs;
+    ofs.open(file_path.value().c_str(), std::ios::out | std::ios::trunc);
+    ofs.close();
+  });
+}
+
+// Invokes DoEachFeedName(name) for every feed name of `program`: the "name"
+// attribute of each paddle::dialect::DataOp in the module op's block,
+// followed by each keyword-argument name of that block.
+template <typename DoEachFeedNameT>
+void VisitFeedName(const pir::Program& program,
+                   const DoEachFeedNameT& DoEachFeedName) {
+  auto module_op = program.module_op();
+  const auto& block = module_op.block();
+  const auto& IsDataOp = [](const pir::Operation& op) -> bool {
+    return op.isa<paddle::dialect::DataOp>();
+  };
+  const auto& GetDataOpName = [](const pir::Operation& op) -> std::string {
+    return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
+  };
+  for (const auto& op : block) {
+    if (IsDataOp(op)) {
+      DoEachFeedName(GetDataOpName(op));
+    }
+  }
+  for (const auto& [name, _] : block.kwargs()) {
+    DoEachFeedName(name);
+  }
+}
+
+// Renders a Python class definition that records `program_id`, the feed
+// `name` and the dimensions of `tensor`. The class name carries a random
+// non-negative suffix, presumably to keep the names of records distinct.
+std::string GetLoggingShapeOrDataForName(int64_t program_id,
+                                         const std::string& name,
+                                         const phi::DenseTensor& tensor) {
+  int64_t random_id = [&] {
+    std::random_device rd{};
+    std::mt19937_64 gen(rd());
+    std::uniform_int_distribution<int64_t> dis(
+        0, std::numeric_limits<int64_t>::max());
+    return dis(gen);
+  }();
+  std::ostringstream ss;
+  ss << "class PirProgram_example_input_tensor_meta_" << random_id << ":";
+  ss << "\n\tprogram_id = " << program_id;
+  ss << "\n\tinput_name = " << std::quoted(name);
+  ss << "\n\tshape = [";
+  bool is_first_dim = true;
+  // Dimensions are int64_t; iterate with the same type so that large extents
+  // are not narrowed to int.
+  for (int64_t dim : ::common::vectorize<int64_t>(tensor.dims())) {
+    if (!is_first_dim) {
+      ss << ", ";
+    }
+    is_first_dim = false;
+    ss << dim;
+  }
+  ss << "]";
+  ss << "\n\n";
+  return ss.str();
+}
+
+// Appends `logging_str` plus a trailing newline to the logging file. Silently
+// returns when no logging path is configured or the file cannot be opened.
+void AppendToLoggingFile(const std::string& logging_str) {
+  std::optional<std::string> file_path = GetLoggingFilePath();
+  if (!file_path.has_value()) return;
+  std::ofstream ofs;
+  ofs.open(file_path.value().c_str(), std::ios::out | std::ios::app);
+  if (!ofs.is_open()) return;
+  ofs << logging_str << std::endl;
+  ofs.close();
+}
+
+// Logs the metadata of feed `name` of program `uid` at most once per
+// (uid, name) pair. The function-local mutex guards `once_flags` and, being
+// held for the whole call, also serializes the file append.
+void AppendLoggingShapeOrDataForName(int64_t uid,
+                                     const std::string& name,
+                                     const phi::DenseTensor& tensor) {
+  static std::mutex mutex;
+  std::unique_lock<std::mutex> lock(mutex);
+  using Name2OnceFlag = std::unordered_map<std::string, std::once_flag>;
+  static std::unordered_map<int64_t, Name2OnceFlag> once_flags;
+  std::call_once(once_flags[uid][name], [&] {
+    AppendToLoggingFile(GetLoggingShapeOrDataForName(uid, name, tensor));
+  });
+}
+
+// Logs shape metadata for every feed of `program` whose variable is found in
+// `scope` and holds a phi::DenseTensor. No-op when
+// FLAGS_logging_pir_py_code_dir is empty.
+void SaveLoggingShapeOrData(const pir::Program& program, const Scope& scope) {
+  if (FLAGS_logging_pir_py_code_dir.empty()) return;
+  TryTruncateLoggingFile();
+  VisitFeedName(program, [&](const std::string& name) {
+    Variable* variable = scope.FindVar(name);
+    if (variable == nullptr) return;
+    if (!variable->IsType<phi::DenseTensor>()) return;
+    const phi::DenseTensor& tensor = variable->Get<phi::DenseTensor>();
+    AppendLoggingShapeOrDataForName(program.id(), name, tensor);
+  });
+}
+
+}  // namespace
+
+// Runs all feed hooks for `program` against `scope`; currently this only logs
+// example input tensor metadata.
+void RunFeedHooks(const pir::Program& program, const Scope& scope) {
+  SaveLoggingShapeOrData(program, scope);
+}
+
+}  // namespace paddle::framework
diff --git a/paddle/fluid/pybind/parallel_executor.h b/paddle/fluid/framework/feed_hook.h
similarity index 70%
rename from paddle/fluid/pybind/parallel_executor.h
rename to paddle/fluid/framework/feed_hook.h
index 3c3acace033a7..3a8584e3899b6 100644
--- a/paddle/fluid/pybind/parallel_executor.h
+++ b/paddle/fluid/framework/feed_hook.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,12 +14,16 @@
 
 #pragma once
 
-#include "pybind11/pybind11.h"
+namespace pir {
 
-namespace paddle {
-namespace pybind {
+class Program;
 
-void BindParallelExecutor(pybind11::module& m);  // NOLINT
+}  // namespace pir
 
-}  // namespace pybind
-}  // namespace paddle
+namespace paddle::framework {
+
+class Scope;
+
+void RunFeedHooks(const pir::Program& program, const Scope& scope);
+
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
index 3583e352956b5..c4b457a20a0ed 100644
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -24,8 +24,7 @@ COMMON_DECLARE_double(eager_delete_tensor_gb);
 COMMON_DECLARE_double(memory_fraction_of_eager_deletion);
 COMMON_DECLARE_bool(fast_eager_deletion_mode);
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 GarbageCollector::GarbageCollector(const platform::Place &place,
                                    size_t max_memory_size)
@@ -249,5 +248,4 @@ std::unique_ptr<GarbageCollector> CreateGarbageCollector(
   return std::unique_ptr<GarbageCollector>(gc.release());
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc
index 134931778dd51..e81e03ae17138 100644
--- a/paddle/fluid/framework/io/crypto/cipher.cc
+++ b/paddle/fluid/framework/io/crypto/cipher.cc
@@ -18,8 +18,7 @@
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 std::shared_ptr<Cipher> CipherFactory::CreateCipher(
     const std::string& config_file) {
@@ -57,5 +56,4 @@ std::shared_ptr<Cipher> CipherFactory::CreateCipher(
   return nullptr;
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/io/save_load_tensor.cc b/paddle/fluid/framework/io/save_load_tensor.cc
index b8a52e9c44fbf..9b5beb5ce9c45 100644
--- a/paddle/fluid/framework/io/save_load_tensor.cc
+++ b/paddle/fluid/framework/io/save_load_tensor.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/phi/common/port.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void SaveTensor(const phi::DenseTensor& x,
                 const std::string& file_path,
@@ -54,5 +53,4 @@ void LoadTensor(const std::string& file_path, phi::DenseTensor* out) {
 
   framework::DeserializeFromStream(fin, out);
 }
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/io/save_paddle2cinn_varmap.cc b/paddle/fluid/framework/io/save_paddle2cinn_varmap.cc
index f4debede0a616..ab922a9b400c6 100644
--- a/paddle/fluid/framework/io/save_paddle2cinn_varmap.cc
+++ b/paddle/fluid/framework/io/save_paddle2cinn_varmap.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 #include "paddle/phi/common/port.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void save_paddle2cinn_varmap(
     std::unordered_map<std::string, std::string> paddle2cinn_var_map,
@@ -45,5 +44,4 @@ void save_paddle2cinn_varmap(
   outfile.close();
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index b3ff3ac35d96d..a5f1d3bea2e7d 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -334,6 +334,8 @@ if(WITH_XPU)
                DEPS ${XPU_PASS_DEPS})
   pass_library(weight_only_linear_xpu_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})
+  pass_library(block_multihead_attention_xpu_pass inference DIR xpu DEPS
+               ${XPU_PASS_DEPS})
 endif()
 
 cc_library(
diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
index 18c7dcc196b5a..376d8f88c015f 100644
--- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
+++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
@@ -20,9 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 AdaptivePool2dConvertGlobalPass::AdaptivePool2dConvertGlobalPass() {  // NOLINT
   AddOpCompat(OpCompat("pool2d"))
@@ -99,9 +97,7 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(num);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(adaptive_pool2d_convert_global_pass,
               paddle::framework::ir::AdaptivePool2dConvertGlobalPass);
diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
index 8a2541de5aae4..f54a86ab97bd5 100644
--- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -26,9 +26,7 @@
 #include "paddle/phi/backends/device_manager.h"
 #endif
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 namespace {
 
@@ -1023,9 +1021,7 @@ void AutoMixedPrecisionPass::InsertCastOp() const {
   VLOG(4) << "insert number of cast op: " << cache.size();
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(auto_mixed_precision_pass,
               paddle::framework::ir::AutoMixedPrecisionPass);
diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
index 966f4ea14967d..1f70e732a7fe2 100644
--- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
+++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
@@ -20,12 +20,10 @@
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/flags.h"
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class ProgramDesc;
 class VarDesc;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
 PADDLE_DEFINE_EXPORTED_double(
     fuse_parameter_memory_size,
@@ -46,9 +44,7 @@ PADDLE_DEFINE_EXPORTED_int32(
     "-1, it means that there are only one group. The default value is 3, it is "
     "an experimental value.");
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 // unit of the FLAGS_fuse_parameter_memory_size.
 static constexpr double kMB = 1048576.0;
 
@@ -600,9 +596,7 @@ class CoalesceGradTensorPass : public ir::Pass {
     op_desc->SetAttr("persist_output", persistable);
   }
 };
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(coalesce_grad_tensor_pass,
               paddle::framework::ir::CoalesceGradTensorPass)
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
index cd823afa96dd4..403aa38102945 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 #define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
 #define GET_NODES                      \
@@ -236,9 +234,7 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(found_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
               paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
diff --git a/paddle/fluid/framework/ir/delete_assign_op_pass_test.cc b/paddle/fluid/framework/ir/delete_assign_op_pass_test.cc
index 92477747fe2be..a1148e1caa7ce 100644
--- a/paddle/fluid/framework/ir/delete_assign_op_pass_test.cc
+++ b/paddle/fluid/framework/ir/delete_assign_op_pass_test.cc
@@ -16,9 +16,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 TEST(delete_assign_op_pass, basic) {
   ProgramDesc program;
@@ -43,8 +41,6 @@ TEST(delete_assign_op_pass, basic) {
           assign_num));
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(delete_assign_op_pass);
diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass.cc b/paddle/fluid/framework/ir/delete_cast_op_pass.cc
index 59fd42241e0d4..c96603f03ad30 100644
--- a/paddle/fluid/framework/ir/delete_cast_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_cast_op_pass.cc
@@ -26,16 +26,11 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 struct CastWritePattern : public PatternBase {
   CastWritePattern(PDPattern* pattern, const std::string& name_scope);
 
@@ -75,7 +70,8 @@ CastWritePattern::CastWritePattern(PDPattern* pattern,
   cast0->LinksFrom({cast0_in}).LinksTo({cast0_out});
   write_to_array->LinksFrom({cast0_out}).LinksTo({write_to_array_out});
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 static std::vector<Node*> FindOpNodeWithInputName(
     ir::Graph* graph, const std::string& input_name) {
@@ -218,7 +214,8 @@ int DeleteCastOpPass::ApplyCastWriteReadPass(ir::Graph* graph) const {
   return found_subgraph_count;
 }
 
-namespace patterns {
+}  // namespace paddle::framework::ir
+namespace paddle::framework::ir::patterns {
 struct CastLodResetWritePattern : public PatternBase {
   CastLodResetWritePattern(PDPattern* pattern, const std::string& name_scope);
 
@@ -267,7 +264,8 @@ CastLodResetWritePattern::CastLodResetWritePattern(
   lod_reset->LinksFrom({cast0_out}).LinksTo({lod_reset_out});
   write_to_array->LinksFrom({lod_reset_out}).LinksTo({write_to_array_out});
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int DeleteCastOpPass::ApplyCastLodResetWriteReadPass(ir::Graph* graph) const {
   if (graph->SubGraphsSize() != 2) {
@@ -418,7 +416,8 @@ int DeleteCastOpPass::ApplyCastLodResetWriteReadPass(ir::Graph* graph) const {
   return found_subgraph_count;
 }
 
-namespace patterns {
+}  // namespace paddle::framework::ir
+namespace paddle::framework::ir::patterns {
 struct CastIndexSamplePattern : public PatternBase {
   CastIndexSamplePattern(PDPattern* pattern, const std::string& name_scope);
 
@@ -475,7 +474,8 @@ CastIndexSamplePattern::CastIndexSamplePattern(PDPattern* pattern,
   index_sample->LinksFrom({cast0_out}).LinksTo({index_sample_out});
   cast1->LinksFrom({index_sample_out}).LinksTo({cast1_out});
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int DeleteCastOpPass::ApplyCastIndexSamplePass(ir::Graph* graph) const {
   GraphPatternDetector gpd;
@@ -509,7 +509,8 @@ int DeleteCastOpPass::ApplyCastIndexSamplePass(ir::Graph* graph) const {
   return found_subgraph_count;
 }
 
-namespace patterns {
+}  // namespace paddle::framework::ir
+namespace paddle::framework::ir::patterns {
 struct CastScatterPattern : public PatternBase {
   CastScatterPattern(PDPattern* pattern, const std::string& name_scope);
 
@@ -587,7 +588,8 @@ CastScatterPattern::CastScatterPattern(PDPattern* pattern,
   scatter->LinksFrom({cast0_out, cast1_out}).LinksTo({scatter_out});
   cast2->LinksFrom({scatter_out}).LinksTo({cast2_out});
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int DeleteCastOpPass::ApplyCastScatterPass(ir::Graph* graph) const {
   GraphPatternDetector gpd;
@@ -625,7 +627,8 @@ int DeleteCastOpPass::ApplyCastScatterPass(ir::Graph* graph) const {
   return found_subgraph_count;
 }
 
-namespace patterns {
+}  // namespace paddle::framework::ir
+namespace paddle::framework::ir::patterns {
 struct CastLookupTablePattern : public PatternBase {
   CastLookupTablePattern(PDPattern* pattern, const std::string& name_scope);
 
@@ -666,7 +669,8 @@ CastLookupTablePattern::CastLookupTablePattern(PDPattern* pattern,
   lookup_table->LinksFrom({lookup_table_w}).LinksTo({lookup_table_out});
   cast->LinksFrom({lookup_table_out}).LinksTo({cast_out});
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int DeleteCastOpPass::ApplyCastLookupTablePass(ir::Graph* graph) const {
   GraphPatternDetector gpd;
@@ -712,7 +716,8 @@ int DeleteCastOpPass::ApplyCastLookupTablePass(ir::Graph* graph) const {
   return found_subgraph_count;
 }
 
-namespace patterns {
+}  // namespace paddle::framework::ir
+namespace paddle::framework::ir::patterns {
 struct CastPattern : public PatternBase {
   CastPattern(PDPattern* pattern, const std::string& name_scope);
 
@@ -741,7 +746,8 @@ CastPattern::CastPattern(PDPattern* pattern, const std::string& name_scope)
 
   cast->LinksFrom({cast_in}).LinksTo({cast_out});
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int DeleteCastOpPass::ApplyCastPass(ir::Graph* graph) const {
   GraphPatternDetector gpd;
@@ -826,9 +832,7 @@ void DeleteCastOpPass::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(delete_cast_op_pass, paddle::framework::ir::DeleteCastOpPass);
 
diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc
index 17f0c642a60d1..c5480db1ca466 100644
--- a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc
+++ b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc
@@ -16,9 +16,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void AddVarToScope(Scope* param_scope,
                    const std::string& name,
@@ -315,8 +313,6 @@ TEST(ApplyCastPass, basic) {
                         cast_num_in_graph));
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(delete_cast_op_pass);
diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass_test.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass_test.cc
index d8cc2210645ea..7a748a8ab8013 100644
--- a/paddle/fluid/framework/ir/delete_dropout_op_pass_test.cc
+++ b/paddle/fluid/framework/ir/delete_dropout_op_pass_test.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/delete_dropout_op_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 TEST(DeleteDropoutOpsPass, dropout) {
   for (std::string dropout_implementation :
@@ -89,8 +87,6 @@ TEST(DeleteDropoutOpsPass, dropout) {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(delete_dropout_op_x_pass);
diff --git a/paddle/fluid/framework/ir/delete_op_device_pass.cc b/paddle/fluid/framework/ir/delete_op_device_pass.cc
index cc5523abd8e62..625a0c9023028 100644
--- a/paddle/fluid/framework/ir/delete_op_device_pass.cc
+++ b/paddle/fluid/framework/ir/delete_op_device_pass.cc
@@ -19,15 +19,11 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 // "op_device" attr is only used in model training. "op_device" attr will change
 // place of op kernel, so we use "delete_op_device_pass" to remove it.
@@ -50,8 +46,6 @@ void DeleteOpDevicePass::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(delete_op_device_pass, paddle::framework::ir::DeleteOpDevicePass);
diff --git a/paddle/fluid/framework/ir/delete_op_device_pass_test.cc b/paddle/fluid/framework/ir/delete_op_device_pass_test.cc
index 2b0ac27782b60..0e4f39495a338 100644
--- a/paddle/fluid/framework/ir/delete_op_device_pass_test.cc
+++ b/paddle/fluid/framework/ir/delete_op_device_pass_test.cc
@@ -16,9 +16,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 TEST(delete_op_device_pass, relu) {
   ProgramDesc program;
@@ -44,8 +42,6 @@ TEST(delete_op_device_pass, relu) {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(delete_op_device_pass);
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
index 2a7071d54843d..e93f1a3c9950c 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
@@ -20,9 +20,7 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 #define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
 #define GET_NODES                         \
@@ -107,9 +105,7 @@ void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(found_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(delete_quant_dequant_op_pass,
               paddle::framework::ir::DeleteQuantDequantOpPass);
diff --git a/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc
index 7fb315de928a6..9a5fc3ddc997c 100644
--- a/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc
@@ -18,9 +18,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void AddVarToScope(Scope* param_scope,
                    const std::string& name,
@@ -105,9 +103,7 @@ TEST(FCFusePass, basic) {
                         num_sparse_fc_nodes_after));
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(fc_fuse_pass);
 USE_PASS(dense_fc_to_sparse_pass);
diff --git a/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc
index ef123166a9fca..e9d9f5c9d8d6a 100644
--- a/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc
+++ b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc
@@ -18,10 +18,7 @@
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 PDNode *patterns::DenseMultiheadMatmul::operator()() {
   auto *multihead_matmul = pattern->NewNode(multihead_matmul_repr())
                                ->assert_is_op("multihead_matmul");
@@ -61,7 +58,8 @@ PDNode *patterns::DenseMultiheadMatmul::operator()() {
 
   return multihead_matmul_out;
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 DenseMultiheadMatmulToSparsePass::DenseMultiheadMatmulToSparsePass() {
   AddOpCompat(OpCompat("multihead_matmul"))
       .AddInput("Input")
@@ -170,9 +168,7 @@ void DenseMultiheadMatmulToSparsePass::ApplyImpl(Graph *graph) const {
   AddStatis(found_multihead_matmul_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(dense_multihead_matmul_to_sparse_pass,
               paddle::framework::ir::DenseMultiheadMatmulToSparsePass);
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 68ec0492a42da..958ea7c272432 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -18,18 +18,11 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static PDNode* create_emb_vars(PDPattern* pattern,
                                const std::string& name,
@@ -139,7 +132,8 @@ void SkipLayerNorm::operator()() {
       .LinksTo({layer_norm_out, layer_norm_mean_var, layer_norm_variance_var});
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int EmbeddingEltwiseLayerNormFusePass::BuildFusion(
     Graph* graph, const std::string& name_scope
@@ -474,9 +468,7 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass,
               paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass);
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
index 05d43788fb20d..a9b406ed5d7ac 100644
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 TEST(FCElementwiseLayerNormFusePass, basic) {
   // inputs                           operator            output
@@ -72,8 +70,6 @@ TEST(FCElementwiseLayerNormFusePass, basic) {
                         num_fused_nodes_after));
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(fc_elementwise_layernorm_fuse_pass);
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index a037793e59190..e4c5b13d90a23 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 FCFusePass::FCFusePass() {
   AddOpCompat(OpCompat("mul"))
@@ -301,9 +299,7 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
   return found_fc_count;
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
     .RequirePassAttr("use_gpu");
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc
index 876e949fdc3d3..ef1044a60ab98 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc
@@ -14,11 +14,7 @@
 
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
-
-namespace fc_lstm_test {
+namespace paddle::framework::ir::fc_lstm_test {
 
 TEST(FcLstmFusePass, basic) {
   std::unique_ptr<ir::Graph> graph = PrepareGraph();
@@ -50,9 +46,6 @@ TEST(FcLstmFusePass, basic) {
                         "The number of fusion_gru nodes does "
                         "not meet expectations after fuse"));
 }
-}  // namespace fc_lstm_test
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir::fc_lstm_test
 
 USE_PASS(fc_lstm_fuse_pass);
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
index 3c550ca84042d..60d83f0b5edfb 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
@@ -193,6 +193,8 @@ ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
     desc.SetOutput(GradVarName("IntermediateOut"),
                    std::vector<std::string>({d_itermediate_out_n}));
 
+    desc.SetAttr("axis", -1);
+    desc.SetAttr("scale", 0.0f);
     desc.SetAttr("save_intermediate_out", false);
     desc.SetAttr("functor_list",
                  std::vector<std::string>(
@@ -273,6 +275,8 @@ ir::Graph *FuseElewiseAddActPass::FuseActElewiseAddInplaceGrad(
     desc.SetOutput(GradVarName("IntermediateOut"),
                    std::vector<std::string>({d_intermediate_var_n}));
 
+    desc.SetAttr("axis", -1);
+    desc.SetAttr("scale", 0.0f);
     desc.SetAttr("save_intermediate_out", false);
     desc.SetAttr("functor_list",
                  std::vector<std::string>({ele_add_grad_op->Op()->Type(),
@@ -315,6 +319,8 @@ Node *FuseElewiseAddActPass::CreateFuseElewiseAddActNode(
   desc.SetOutput("Out", std::vector<std::string>({act_out_n}));
   desc.SetOutput("IntermediateOut", std::vector<std::string>({ele_out_n}));
   desc.SetType("fused_elemwise_add_activation");
+  desc.SetAttr("axis", -1);
+  desc.SetAttr("scale", 0.0f);
   desc.SetAttr("save_intermediate_out", true);
   desc.SetAttr(
       "functor_list",
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
index 15c5b0b379b13..9ba4b6d9d816d 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
@@ -22,9 +22,7 @@
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -342,8 +340,6 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
     return scale_node;
   }
 };
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fuse_adam_op_pass, paddle::framework::ir::FuseAdamOpPass);
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
index d24322ede7e75..523b2a2e5eaf2 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
@@ -21,9 +21,7 @@
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -113,8 +111,6 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass {
   }
 };
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fuse_momentum_op_pass, paddle::framework::ir::FuseMomentumOpPass);
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
index 4a9e316f30b2b..52dee2bef4e64 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -18,9 +18,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/core/kernel_factory.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   ir::Graph &result = *graph;
@@ -688,6 +686,4 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
     graph->RemoveNode(ctrl_var_node);
   }
 }
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
index a3ec33d8b2fb6..cefcb56634aba 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -68,8 +66,6 @@ class FuseSgdOpPass : public FuseOptimizerOpPass {
     return graph->CreateOpNode(&Sgd_desc);
   }
 };
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::ir::FuseSgdOpPass);
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
index d08da8813f17b..d179547c1b409 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -22,9 +22,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const {
   graph = FuseReluDepthwiseConv(graph, true);
@@ -186,9 +184,7 @@ ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
   return graph;
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fuse_relu_depthwise_conv_pass,
               paddle::framework::ir::FuseReluDepthwiseConvPass);
diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc
index db281b64f9299..619206f77df50 100644
--- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc
+++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc
@@ -20,16 +20,11 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static const std::unordered_set<std::string> FFN_ACTS{"relu", "gelu"};
 
@@ -1089,7 +1084,8 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() {
   return ffn_output;
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 inline Node* CreatePersistableVarNode(Graph* graph, const std::string& name) {
   auto var_desc = VarDesc(name);
@@ -3361,9 +3357,7 @@ MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::
       .End();
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fused_multi_transformer_decoder_pass,
               paddle::framework::ir::FusedMultiTransformerDecoderPass);
diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc
index 6c08bd2941ff1..f38534468337b 100644
--- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void AddVarToScope(Scope* param_scope,
                    const std::string& name,
@@ -550,9 +548,7 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass,
               "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass"));
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(fused_multi_transformer_decoder_pass);
 USE_PASS(fused_multi_transformer_decoder_fuse_qkv_pass);
diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc
index 3b4f475df5f36..370cb3e73bcbd 100644
--- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #ifndef UNUSED
 #define UNUSED __attribute__((unused))
 #endif
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void AddVarToScope(Scope* param_scope,
                    const std::string& name,
@@ -711,9 +709,7 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass,
               "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass"));
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(fused_multi_transformer_encoder_pass);
 USE_PASS(fused_multi_transformer_encoder_fuse_qkv_pass);
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index 2e5c2b5be4ac3..defc320495064 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -17,10 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
 #include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace fusion_group {
+namespace paddle::framework::ir::fusion_group {
 
 std::string ExtractDataType(const std::vector<Node*>& nodes) {
   std::string dtype_str = "";
@@ -373,7 +370,4 @@ std::unordered_map<Node*, int> CodeGenerator::EncodeVarNodes(
   return var_ids;
 }
 
-}  // namespace fusion_group
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir::fusion_group
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index b66ee6f31cdb0..799b18e3e4fc7 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -26,9 +26,7 @@ PADDLE_DEFINE_EXPORTED_bool(all_blocks_convert_trt,
                             false,
                             "Convert all blocks'Ops into TensorRT Ops");
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 Graph::Graph(const ProgramDesc &program)
     : Graph(
@@ -402,6 +400,4 @@ std::unique_ptr<Graph> Graph::CloneSubGraph(const size_t idx) {
 bool IsControlDepVar(const ir::Node &var) {
   return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
 }
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 53e2697daa868..46e8e91971ba0 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -40,9 +40,7 @@ PADDLE_DEFINE_EXPORTED_string(print_sub_graph_dir,
                               "FLAGS_print_sub_graph_dir is used "
                               "to print the nodes of sub_graphs.");
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 namespace {
 
 template <class NodeComparator = ir::NodeComp>
@@ -964,6 +962,4 @@ std::vector<std::vector<std::vector<ir::Node::Dep>>> GetOpDependencies(
   return deps;
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
index 5972cd40817ac..c77b694e90f24 100644
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -18,9 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void BuildCircleGraph(Graph* g) {
   ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
@@ -222,6 +220,4 @@ TEST(GraphHelperTest, GraphNum) {
   ASSERT_EQ(GraphNum(g3), 2UL);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc
index 7d0cb815c9af7..61a971dd9501f 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -19,15 +19,11 @@ limitations under the License. */
 #include "paddle/common/flags.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class ProgramDesc;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const {
   auto& program = Get<ProgramDesc>("program");
@@ -39,8 +35,6 @@ void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass);
diff --git a/paddle/fluid/framework/ir/groupnorm_act_pass.cc b/paddle/fluid/framework/ir/groupnorm_act_pass.cc
index 397a7437757cc..ff2df5887d6f3 100644
--- a/paddle/fluid/framework/ir/groupnorm_act_pass.cc
+++ b/paddle/fluid/framework/ir/groupnorm_act_pass.cc
@@ -19,18 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 struct GroupNormAct : public PatternBase {
   GroupNormAct(PDPattern *pattern, const std::string &name_scope)
@@ -80,7 +73,8 @@ void GroupNormAct::operator()(PDNode *x) {
   act->LinksFrom({group_norm_out_var}).LinksTo({act_out});
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int GroupNormActFusePass::ApplyGNSiluPattern(ir::Graph *graph) const {
   PADDLE_ENFORCE_NOT_NULL(
@@ -155,9 +149,7 @@ void GroupNormActFusePass::ApplyImpl(ir::Graph *graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(groupnorm_act_pass, paddle::framework::ir::GroupNormActFusePass);
 REGISTER_PASS_CAPABILITY(groupnorm_act_pass)
diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index 338e8227228d4..aceea7aae8d82 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Graph;
 
@@ -52,8 +50,6 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
diff --git a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
index c9c9a271d439b..a197af6e41c1f 100644
--- a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc
@@ -22,9 +22,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -237,9 +235,7 @@ void MatmulV2ScaleFusePass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(found_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(matmul_scale_fuse_pass,
               paddle::framework::ir::MatmulScaleFusePass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
index 97588757280cf..817ab6f3f85ef 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc
@@ -17,9 +17,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/controlflow/op_variant.h"
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 using OpVariant = operators::OpVariant;
 class ConditionalOpEagerDeletionPass : public Pass {
  protected:
@@ -94,9 +92,7 @@ class ConditionalOpEagerDeletionPass : public Pass {
   }
 };
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(conditional_block_op_eager_deletion_pass,
               paddle::framework::ir::ConditionalOpEagerDeletionPass);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
index 924938c7d00cb..f01d87025a343 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
@@ -14,17 +14,11 @@
 
 #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
 
-namespace paddle {
-namespace framework {
-namespace details {
+namespace paddle::framework::details {
 class OpHandleBase;
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::details
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 OpGraphView::OpGraphView(const std::vector<details::OpHandleBase *> &ops)
     : preceding_ops_(), pending_ops_() {
@@ -94,6 +88,4 @@ OpGraphView::GetPrecedingDepNum() const {
 
 size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc
index 2e6aaa37808ae..1fbe22ff33021 100644
--- a/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc
@@ -39,9 +39,7 @@
   GET_IR_NODE(layernorm_40_in_bias);  \
   GET_IR_NODE(layernorm_40_in_scale); \
   GET_IR_NODE(layernorm_40_out);
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 MergeLayernormFusePass::MergeLayernormFusePass() {
   AddOpCompat(OpCompat("reshape2"))
       .AddInput("X")
@@ -176,9 +174,7 @@ void MergeLayernormFusePass::ApplyImpl(ir::Graph* graph) const {
   gpd(graph, handler);
   AddStatis(fusion_count);
 }
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 REGISTER_PASS(merge_layernorm_fuse_pass,
               paddle::framework::ir::MergeLayernormFusePass);
 REGISTER_PASS_CAPABILITY(merge_layernorm_fuse_pass)
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index e35e5d297db9b..5ffdaee331c6d 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 static const char kNumRepeats[] = "num_repeats";  // NOLINT
 typedef std::unordered_map<std::string, std::vector<ir::Node*>> SSAVarList;
@@ -335,9 +333,7 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass)
     .RequirePassAttr(paddle::framework::ir::kNumRepeats);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc
index b907869b4a38e..e0b96b69116a4 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc
@@ -16,9 +16,7 @@
 
 #include "paddle/fluid/framework/ir/pass.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Graph;
 
@@ -106,9 +104,7 @@ void AddReaderDependencyPass::ApplyImpl(Graph *graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(add_reader_dependency_pass,
               paddle::framework::ir::AddReaderDependencyPass);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
index a4feed4693a62..1913888dc316e 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
@@ -18,9 +18,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class BackWardOpDepsPass : public ir::Pass {
  protected:
@@ -195,9 +193,7 @@ class BackWardOpDepsPass : public ir::Pass {
     }
   }
 };
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(backward_optimizer_op_deps_pass,
               paddle::framework::ir::BackWardOpDepsPass);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc
index d7d18f6e8469c..1c3e4c03e561f 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc
@@ -24,9 +24,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/utils/string/string_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 static std::string kSep(1, static_cast<char>(1));  // NOLINT
 
@@ -269,8 +267,6 @@ class FixOpRunOrderPass : public Pass {
   }
 };
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(fix_op_run_order_pass, paddle::framework::ir::FixOpRunOrderPass);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index cc20f52180871..b16548c545ef0 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -41,9 +41,7 @@
 #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
 #endif
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 namespace {
 // TODO(panyx0718): Clean this up as well.
@@ -1377,9 +1375,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
   return 0;
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 #define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class)                \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc
index f4f0e393c2499..72e8baaba5017 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc
@@ -17,9 +17,7 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 static std::unordered_set<std::string> ReaderOpSet() {
   return {"create_py_reader"};
@@ -78,6 +76,4 @@ void SetReaderOpDeviceInfo(Graph *graph, size_t dev_cnt, size_t dev_idx) {
   VLOG(10) << "Found op number " << found_op_num;
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index ebf273a8d1c2e..008f1e95cd4f7 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -22,16 +22,11 @@
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/common/data_type.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) {
   if (op->IsOp() && op->Op()) {
@@ -635,7 +630,8 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() {
 
   return transpose2_2_out_var;
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 namespace {
 template <typename T>
@@ -1615,9 +1611,7 @@ void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(multihead_matmul_fuse_pass,
               paddle::framework::ir::MultiHeadMatmulFusePass);
diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc
index 1f91b6955aadf..2bcc0de82c015 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc
@@ -19,10 +19,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) {
   if (op->IsOp() && op->Op()) {
@@ -310,7 +307,8 @@ PDNode* MultiHeadMatmulRoformerPattern::operator()() {
 
   return transpose2_2_out_var;
 }
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 MultiHeadMatmulRoformerFusePass::MultiHeadMatmulRoformerFusePass() {
   AddOpCompat(OpCompat("elementwise_add"))
@@ -758,9 +756,7 @@ void MultiHeadMatmulRoformerFusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(multihead_matmul_roformer_fuse_pass,
               paddle::framework::ir::MultiHeadMatmulRoformerFusePass);
diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc
index 2d84162e13aa6..26ca77f3b00ce 100644
--- a/paddle/fluid/framework/ir/node_test.cc
+++ b/paddle/fluid/framework/ir/node_test.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/var_desc.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -103,6 +101,4 @@ TEST(NodeTest, ToString) {
   EXPECT_EQ(n3->ToString(), "{n2} = n3(n1)");
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc
index 1c733636ca7b0..cf17f00fa4080 100644
--- a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc
@@ -21,9 +21,7 @@
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void ComputePropagateScalesMkldnnPass::GetTensorFromVector(
     const std::vector<float>& data_v, phi::DenseTensor* tensor) const {
@@ -516,9 +514,7 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const {
       graph, "has_quant_info", "var_quant_scales", var_quant_scales);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(compute_propagate_scales_onednn_pass,
               paddle::framework::ir::ComputePropagateScalesMkldnnPass);
diff --git a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc
index 9664647fd4214..f8cc1ca17c99a 100644
--- a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass_tester.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/phi/common/place.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 const std::array<float, 10> positive_and_negative_values = {-0.0482659,
                                                             -0.0102493,
@@ -347,6 +345,4 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, update_relu_output_scales) {
       BuildConv2dReluProgramDesc(), &var_quant_scales, {"conv_out"});
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc
index 61c0457f7c740..bfe0296640dfb 100644
--- a/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/conv_activation_onednn_fuse_pass.cc
@@ -17,9 +17,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using string::PrettyLogDetail;
 
@@ -373,9 +371,7 @@ ConvActivationMkldnnFusePass::ConvActivationMkldnnFusePass() {
       .End();
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(conv_activation_onednn_fuse_pass,
               paddle::framework::ir::ConvActivationMkldnnFusePass);
diff --git a/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc
index 5ee6e361bcc92..e04314f399be5 100644
--- a/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/conv_affine_channel_onednn_fuse_pass.cc
@@ -23,15 +23,11 @@ namespace phi {
 class DenseTensor;
 }  // namespace phi
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -309,9 +305,7 @@ void ConvAffineChannelFusePass::FuseConvAffineChannel(
   AddStatis(found_conv_ac_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(conv_affine_channel_onednn_fuse_pass,
               paddle::framework::ir::ConvAffineChannelFusePass);
diff --git a/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc
index 1cf663d13deef..c63cf3ed74b2f 100644
--- a/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/conv_bias_onednn_fuse_pass.cc
@@ -22,9 +22,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 ConvBiasFusePass::ConvBiasFusePass() {
   AddOpCompat(OpCompat("conv2d"))
@@ -445,9 +443,7 @@ void ConvBiasFusePass::FuseConvBias(ir::Graph* graph,
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 REGISTER_PASS(conv_bias_onednn_fuse_pass,
               paddle::framework::ir::ConvBiasFusePass);
 REGISTER_PASS_CAPABILITY(conv_bias_onednn_fuse_pass)
diff --git a/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc
index 7733730f7d605..14857f3c550d8 100644
--- a/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() {
   AddOpCompat(OpCompat("conv2d"))
@@ -305,9 +303,7 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const {
 
   AddStatis(graph_with_stats.second);
 }
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(conv_elementwise_add_onednn_fuse_pass,
               paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
diff --git a/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc
index c31e59b39216a..e5d2ae598b81d 100644
--- a/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc
+++ b/paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass_tester.cc
@@ -17,9 +17,7 @@
 #include "paddle/fluid/framework/ir/onednn/cpu_bfloat16_pass.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void SetOp(ProgramDesc* prog,
            const std::string& type,
@@ -230,8 +228,6 @@ TEST(CpuBfloat16Pass, double_outputs_ops) {
            added_nodes);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(cpu_bfloat16_pass);
diff --git a/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc
index a512f4b8021f4..a7256cdfe9404 100644
--- a/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/cpu_quantize_pass.cc
@@ -22,9 +22,7 @@
 #include "paddle/fluid/platform/onednn_helper.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
 using EigenVectorArrayMapFloat =
@@ -1313,9 +1311,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeFusionLSTM(graph);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
     .RequirePassAttr("quant_var_scales");
diff --git a/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc
index 3f0423870d366..427de4d610754 100644
--- a/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/elementwise_act_onednn_fuse_pass.cc
@@ -20,9 +20,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using string::PrettyLogDetail;
 
@@ -81,9 +79,7 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct(
                     act_type);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(elementwise_act_onednn_fuse_pass,
               paddle::framework::ir::ElementwiseActivationOneDNNPass);
diff --git a/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc
index 499a7734d71d6..33b512dc7669c 100644
--- a/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/int8_scale_calculation_onednn_pass.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/platform/onednn_helper.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 Int8ScaleCalculationMkldnnPass::Int8ScaleCalculationMkldnnPass() {  // NOLINT
   AddOpCompat(OpCompat("conv2d"))
@@ -210,9 +208,7 @@ void Int8ScaleCalculationMkldnnPass::Int8ScaleImpl(
   AddStatis(found_int8_scales_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(int8_scale_calculation_onednn_pass,
               paddle::framework::ir::Int8ScaleCalculationMkldnnPass);
diff --git a/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc b/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc
index 8f384931a589c..ad0add6dd3c0c 100644
--- a/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/interpolate_onednn_pass.cc
@@ -19,15 +19,11 @@
 
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Graph;
 
@@ -65,9 +61,7 @@ void InterpolateOneDNNPass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(found_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(interpolate_onednn_pass,
               paddle::framework::ir::InterpolateOneDNNPass);
diff --git a/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc
index 214b8e12fd0b1..63dc9cd677a4f 100644
--- a/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/multi_gru_seq_fuse_pass.cc
@@ -26,9 +26,7 @@
 #include "paddle/fluid/platform/onednn_helper.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
 using string::PrettyLogDetail;
@@ -179,9 +177,7 @@ MultiGruSeqFusePass::MultiGruSeqFusePass() {
       .End();
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(multi_gru_seq_fuse_pass,
               paddle::framework::ir::MultiGruSeqFusePass);
diff --git a/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc
index a21ddd579be3c..f937a1c681b17 100644
--- a/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc
@@ -19,9 +19,7 @@
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using string::PrettyLogDetail;
 
@@ -132,9 +130,7 @@ void FuseOperatorReshape2OneDNNPass::FuseReshape2(Graph *graph,
                     op_type);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(operator_reshape2_onednn_fuse_pass,
               paddle::framework::ir::FuseOperatorReshape2OneDNNPass);
diff --git a/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc
index 36ff2110e582f..716419434933d 100755
--- a/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/onednn/params_quantization_onednn_pass_tester.cc
@@ -18,9 +18,7 @@
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/phi/common/place.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 namespace {
 struct Data {
   Data() = default;
@@ -380,8 +378,6 @@ TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1ws) {
 }
 
 }  // namespace
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(params_quantization_onednn_pass);
diff --git a/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc
index f3250c32604c6..3b22bea8205b4 100644
--- a/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/reshape_transpose_matmul_onednn_fuse_pass.cc
@@ -18,9 +18,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(Graph *graph) const {
   auto matmul_types = {"matmul", "matmul_v2", "fused_matmul"};
@@ -264,9 +262,7 @@ ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() {
       .End();
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(reshape_transpose_matmul_onednn_fuse_pass,
               paddle::framework::ir::ReshapeTransposeMatmulMkldnnFusePass);
diff --git a/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc
index 7ae647c6d28f7..7c8930f9fccc8 100644
--- a/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/scale_matmul_fuse_pass.cc
@@ -22,9 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/onednn_helper.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Graph;
 
@@ -141,9 +139,7 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
                     found_scale_matmul_fuse_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(scale_matmul_fuse_pass,
               paddle::framework::ir::ScaleMatmulFusePass);
diff --git a/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc
index 7bce1813fed8a..2ae5301258d65 100644
--- a/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.cc
@@ -18,9 +18,7 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 #define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
 #define GET_NODES             \
@@ -231,9 +229,7 @@ void ShuffleChannelMKLDNNDetectPass::ApplyImpl(ir::Graph* graph) const {
   gpd(graph, handler);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(shuffle_channel_onednn_detect_pass,
               paddle::framework::ir::ShuffleChannelMKLDNNDetectPass);
diff --git a/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc
index da389d3a1353c..4cfa4c637bc34 100644
--- a/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc
+++ b/paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass_tester.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/framework/ir/onednn/shuffle_channel_onednn_detect_pass.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void AddVarToScope(Scope* param_scope,
                    const std::string& name,
@@ -78,8 +76,6 @@ TEST(ShuffleChannelOneDNNDetectPass, ShuffleChannelOneDNNDetectPassTest) {
   MainTest();
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(shuffle_channel_onednn_detect_pass);
diff --git a/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc
index d18765ff27bdd..3d6821d5bd79b 100644
--- a/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/softplus_activation_onednn_fuse_pass.cc
@@ -21,9 +21,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using string::PrettyLogDetail;
 
@@ -76,9 +74,7 @@ void SoftplusActivationOneDNNPass::FuseSoftplusActivation(
                     found_softplus_activation_count,
                     act_type);
 }
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(softplus_activation_onednn_fuse_pass,
               paddle::framework::ir::SoftplusActivationOneDNNPass);
diff --git a/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc
index 4af9c6a770436..7ac8edbb6005c 100644
--- a/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc
@@ -17,9 +17,7 @@
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 using string::PrettyLogDetail;
 
@@ -77,9 +75,7 @@ void FuseSqueeze2Transpose2OneDNNPass::ApplyImpl(Graph *graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(squeeze2_transpose2_onednn_fuse_pass,
               paddle::framework::ir::FuseSqueeze2Transpose2OneDNNPass);
diff --git a/paddle/fluid/framework/ir/pass_test_util.cc b/paddle/fluid/framework/ir/pass_test_util.cc
index ee75794d7ccc4..6007bfc64929b 100644
--- a/paddle/fluid/framework/ir/pass_test_util.cc
+++ b/paddle/fluid/framework/ir/pass_test_util.cc
@@ -28,10 +28,7 @@
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace test {
+namespace paddle::framework::ir::test {
 
 OpDesc* CreateOp(ProgramDesc* prog,
                  const std::string& op_type_name,
@@ -236,7 +233,4 @@ OpDesc* GetOp(const BlockDesc& block_desc,
   return nullptr;
 }
 
-}  // namespace test
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir::test
diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc
index ccf2bf22ab57b..718e15b01fd72 100644
--- a/paddle/fluid/framework/ir/placement_pass_base.cc
+++ b/paddle/fluid/framework/ir/placement_pass_base.cc
@@ -18,9 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/operator.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void PlacementPassBase::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Applies " << GetPlacementName() << " placement strategy.";
@@ -43,6 +41,4 @@ void PlacementPassBase::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc b/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc
index 7cbb5c169f63c..3917423754ba4 100644
--- a/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc
+++ b/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc
@@ -18,18 +18,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 struct PrelnGroupNormAct : public PatternBase {
   PrelnGroupNormAct(PDPattern *pattern, const std::string &name_scope)
@@ -92,7 +85,8 @@ void PrelnGroupNormAct::operator()(PDNode *x, PDNode *y, bool with_act) {
   }
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int PrelnGroupNormActFusePass::ApplyAddGNPattern(ir::Graph *graph,
                                                  bool with_act) const {
@@ -203,9 +197,7 @@ void PrelnGroupNormActFusePass::ApplyImpl(ir::Graph *graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(preln_elementwise_groupnorm_act_pass,
               paddle::framework::ir::PrelnGroupNormActFusePass);
diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc
index 1734e7d675755..4d7a4b6d8406a 100644
--- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc
@@ -18,18 +18,11 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static PDNode* create_emb_vars(PDPattern* pattern,
                                const std::string& name,
@@ -147,7 +140,8 @@ void PrelnSkipLayerNorm::operator()() {
       .LinksTo({layer_norm_out, layer_norm_mean_var, layer_norm_variance_var});
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(
     Graph* graph, const std::string& name_scope
@@ -455,9 +449,7 @@ void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(preln_embedding_eltwise_layernorm_fuse_pass,
               paddle::framework::ir::PrelnEmbeddingEltwiseLayerNormFusePass);
diff --git a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc
index 48baf1f4b102f..efe7321874b8f 100644
--- a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc
@@ -19,18 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 struct PrelnResidualBias : public PatternBase {
   PrelnResidualBias(PDPattern *pattern,
@@ -127,7 +120,8 @@ void PrelnResidualBias::operator()(PDNode *x, PDNode *y) {
           {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var});
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 void setIntermediateOut(OpDesc *desc,
                         const std::string &out_name,
@@ -300,9 +294,7 @@ void PrelnResidualBiasFusePass::ApplyImpl(ir::Graph *graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(preln_residual_bias_fuse_pass,
               paddle::framework::ir::PrelnResidualBiasFusePass);
diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc
index 8547fceed0773..1b7ed93e8bfcb 100644
--- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc
@@ -19,18 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 struct PrelnSkipLayerNorm : public PatternBase {
   PrelnSkipLayerNorm(PDPattern *pattern, const std::string &name_scope)
@@ -102,7 +95,8 @@ void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) {
           {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var});
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   PADDLE_ENFORCE_NOT_NULL(
@@ -220,9 +214,7 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(preln_skip_layernorm_fuse_pass,
               paddle::framework::ir::PrelnSkipLayerNormFusePass);
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
index 519be82a0025f..63cbe6218ead7 100644
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -18,9 +18,7 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 QuantDequantFusePass::QuantDequantFusePass() {
   AddOpCompat(OpCompat("fake_quantize_range_abs_max"))
       .AddInput("X")
@@ -625,9 +623,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
               paddle::framework::ir::QuantDequantFusePass);
diff --git a/paddle/fluid/framework/ir/quantize_helper.cc b/paddle/fluid/framework/ir/quantize_helper.cc
index c4b06651f1bbb..b424212c0bdb2 100644
--- a/paddle/fluid/framework/ir/quantize_helper.cc
+++ b/paddle/fluid/framework/ir/quantize_helper.cc
@@ -14,9 +14,7 @@
 
 #include "paddle/fluid/framework/ir/quantize_helper.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void SaveQuantInfoInTheGraph(
     ir::Graph* graph,
@@ -74,6 +72,4 @@ std::vector<float> GetScaleVecValueForNode(
   return var_quant_scales->at(node->Name());
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
index d4e8a1683ed18..cb5f23d7d39be 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
@@ -17,9 +17,7 @@
 #include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void SetOp(ProgramDesc* prog,
            const std::string& type,
@@ -213,8 +211,6 @@ TEST(SeqPoolConcatFusePass, more_inputs) {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(seqpool_concat_fuse_pass);
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
index eeef9c73db3d7..2e0810571ebdf 100644
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
@@ -18,9 +18,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Graph;
 class Node;
@@ -197,9 +195,7 @@ void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(seqpool_cvm_concat_fuse_pass,
               paddle::framework::ir::SeqPoolCVMConcatFusePass);
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
index 7626c1e9142f9..c0e31259f7771 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -18,9 +18,7 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 #define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
 #define GET_NODES             \
@@ -230,9 +228,7 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
   gpd(graph, handler);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(shuffle_channel_detect_pass,
               paddle::framework::ir::ShuffleChannelDetectPass);
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
index 93a1008838558..1879150235e5c 100644
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
@@ -20,9 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 /*
  * This pass is to simplify the Graph, it may contains:
@@ -237,9 +235,7 @@ void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op,
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(simplify_with_basic_ops_pass,
               paddle::framework::ir::SimplifyWithBasicOpsPass);
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
index f29df2961d995..5afc03db69b3b 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -19,18 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 struct SkipLayerNorm : public PatternBase {
   SkipLayerNorm(PDPattern *pattern, const std::string &name_scope)
@@ -99,7 +92,8 @@ PDNode *SkipLayerNorm::operator()(PDNode *x, PDNode *y) {
   return layer_norm_out_var;
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   PADDLE_ENFORCE_NOT_NULL(
@@ -194,9 +188,7 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(skip_layernorm_fuse_pass,
               paddle::framework::ir::SkipLayerNormFusePass);
diff --git a/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc b/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc
index 9097eb6572521..d5461037435e5 100644
--- a/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc
+++ b/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc
@@ -24,9 +24,7 @@
 #include "paddle/utils/string/pretty_log.h"
 #include "paddle/utils/string/printf.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 // cpplint complaints (wrong!) for not included <string> header in below line.
 using string::PrettyLogDetail;  // NOLINT
@@ -440,9 +438,7 @@ void SplitLayerNormPass::ApplyImpl(Graph* graph) const {
   AddStatis(found_layer_norm_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(split_layernorm_to_math_ops_pass,
               paddle::framework::ir::SplitLayerNormPass);
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
index b300dcd76119c..c2a73a3aac512 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
@@ -18,9 +18,7 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
                                   const std::string& name_scope) {
@@ -489,9 +487,7 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(squared_mat_sub_fuse_pass,
               paddle::framework::ir::SquaredMatSubFusePass);
diff --git a/paddle/fluid/framework/ir/subgraph_detector.cc b/paddle/fluid/framework/ir/subgraph_detector.cc
index 79df75bd780d5..82e4000179bcc 100644
--- a/paddle/fluid/framework/ir/subgraph_detector.cc
+++ b/paddle/fluid/framework/ir/subgraph_detector.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Graph;
 class Node;
@@ -485,6 +483,4 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
   return node.inputs.size() == n;
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
index 4a443dc70860c..eafdcfdc75803 100644
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 void SetOp(ProgramDesc* prog,
            const std::string& type,
@@ -92,8 +90,6 @@ TEST(IsTestPass, basic) {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 USE_PASS(sync_batch_norm_pass);
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index 6774a6baae023..338bcf5c50e11 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -16,9 +16,7 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 TransposeFlattenConcatFusePass::TransposeFlattenConcatFusePass() {
   AddOpCompat(OpCompat("transpose2"))
@@ -215,9 +213,7 @@ void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
   }
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass);
diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc
index 6bc9cb324d80d..673f1d3bfb83d 100644
--- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc
+++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc
@@ -20,9 +20,7 @@
 #include <unordered_set>
 #include <vector>
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 #define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
 #define GET_NODES                                 \
@@ -373,9 +371,7 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl(
   AddStatis(found_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(trt_delete_weight_dequant_linear_op_pass,
               paddle::framework::ir::TrtDeleteWeightQuantDequantLinearOpPass);
diff --git a/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc b/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc
index 0ca812bad7662..d0062ec632b16 100644
--- a/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc
+++ b/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc
@@ -22,9 +22,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
@@ -118,9 +116,7 @@ void TrtMapOpsToMatrixMultiplyPass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(found_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(trt_map_ops_to_matrix_multiply_pass,
               paddle::framework::ir::TrtMapOpsToMatrixMultiplyPass);
diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
index 0bee108064d08..c8d49cdfaedca 100644
--- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
@@ -19,16 +19,11 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) {
   if (op->IsOp() && op->Op()) {
@@ -628,7 +623,8 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
   return transpose2_2_out_var;
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 void TrtMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const {
   FusePassBase::Init(name_scope_, graph);
@@ -1541,9 +1537,7 @@ void TrtMultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(trt_multihead_matmul_fuse_pass,
               paddle::framework::ir::TrtMultiHeadMatmulFusePass);
diff --git a/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc
index f93a42a7dbab8..6772612134783 100644
--- a/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc
@@ -18,18 +18,11 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 static PDNode* create_emb_vars(PDPattern* pattern,
                                const std::string& name,
@@ -201,7 +194,8 @@ void TrtPromptTuningSkipLayerNorm::operator()() {
       .LinksTo({layer_norm_out});
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int TrtPromptTuningEmbeddingEltwiseLayerNormFusePass::BuildFusion(
     Graph* graph, const std::string& name_scope
@@ -580,9 +574,7 @@ void TrtPromptTuningEmbeddingEltwiseLayerNormFusePass::ApplyImpl(
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(
     trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass,
diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc
index d68694106b5c7..c6a22c143fb66 100644
--- a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc
@@ -22,10 +22,7 @@
 #endif
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 //       input_qk   input_v
 //       |q     |k      v
@@ -249,7 +246,8 @@ PDNode* TrtQKMultiHeadMatmulPattern::operator()() {
   return reshape2_qkv_out_var;
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph,
                                                 const std::string& name_scope,
@@ -575,9 +573,7 @@ void TrtQkMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(trt_qk_multihead_matmul_fuse_pass,
               paddle::framework::ir::TrtQkMultiHeadMatmulFusePass);
diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
index 0708218dbd07c..e90cadc782a61 100644
--- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
@@ -22,18 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #endif
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 class Node;
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
-namespace paddle {
-namespace framework {
-namespace ir {
-namespace patterns {
+namespace paddle::framework::ir::patterns {
 
 struct TrtSkipLayerNorm : public PatternBase {
   TrtSkipLayerNorm(PDPattern *pattern, const std::string &name_scope)
@@ -102,7 +95,8 @@ PDNode *TrtSkipLayerNorm::operator()(PDNode *x, PDNode *y) {
   return layer_norm_out_var;
 }
 
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   PADDLE_ENFORCE_NOT_NULL(
@@ -271,9 +265,7 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(trt_skip_layernorm_fuse_pass,
               paddle::framework::ir::TrtSkipLayerNormFusePass);
diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
index d9907555a17b5..6b49a99c02364 100644
--- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
+++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
@@ -26,9 +26,7 @@
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/phi/common/data_type.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 namespace {
 
@@ -383,8 +381,6 @@ void TrtSupportNHWCPass::ApplyImpl(Graph *graph) const {
   AddStatis(transposed_ops.size());
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(trt_support_nhwc_pass, paddle::framework::ir::TrtSupportNHWCPass);
diff --git a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc
index 382e1c60ee989..a547301b835c9 100644
--- a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc
@@ -52,9 +52,7 @@
   GET_IR_NODE(reshape2_op);       \
   GET_IR_NODE(reshape2_out);
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 bool HasScale(OpDesc* const op_ptr,
               std::string* name,
@@ -160,9 +158,7 @@ void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(fusion_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(vit_attention_fuse_pass,
               paddle::framework::ir::VitAttentionFusePass);
diff --git a/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc
new file mode 100644
index 0000000000000..3d4c78896f7e2
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle::framework {
+class Scope;
+}  // namespace paddle::framework
+
+namespace paddle::framework::ir {
+
+// Rewrites every `block_multihead_attention` op into
+// `block_multihead_attention_xpu` and feeds it two extra persistable FP32
+// inputs (`cache_k_per_batch_maxs`, `cache_v_per_batch_maxs`) whose backing
+// tensors are created zero-filled in the parameter scope.
+class BlockMultiHeadAttentionXPUPass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  // Performs the in-place op rewrite described on the class.
+  void InplaceBlockMultiHeadAttentionXPU(ir::Graph* graph) const;
+
+  const std::string name_scope_{"block_multihead_attention_xpu_pass"};
+};
+
+// Pass entry point: rejects a null graph, initializes the pass base with
+// this pass's name scope, then rewrites the matching ops.
+void BlockMultiHeadAttentionXPUPass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+
+  InplaceBlockMultiHeadAttentionXPU(graph);
+}
+
+void BlockMultiHeadAttentionXPUPass::InplaceBlockMultiHeadAttentionXPU(
+    ir::Graph* graph) const {
+  // First dimension of the per-batch max buffers.
+  // NOTE(review): hard-coded bound; confirm it covers the largest batch the
+  // XPU kernel is expected to serve.
+  constexpr int64_t kMaxBatchSize = 10;
+
+  // Collect the targets up front: CreateVarNode() below adds nodes to the
+  // graph, so the node container must not be mutated while iterating it.
+  std::vector<Node*> target_ops;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "block_multihead_attention") {
+      target_ops.push_back(node);
+    }
+  }
+  if (target_ops.empty()) return;
+
+  auto* scope = param_scope();
+  const int64_t max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
+
+  // A single zero-filled host tensor serves as the source for every new
+  // variable created below.
+  phi::DenseTensor zero_maxs;
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(
+      platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+  zero_maxs.set_type(phi::DataType::FLOAT32);
+  zero_maxs.Resize({kMaxBatchSize, max_ptr_size});
+  std::fill_n(cpu_ctx->Alloc<float>(&zero_maxs),
+              kMaxBatchSize * max_ptr_size,
+              0.0f);
+
+  // Creates the persistable FP32 variable `var_name`, initializes it from
+  // `zero_maxs`, and wires it into `op_node` as input `input_name`.
+  auto add_zero_maxs_input = [&](Node* op_node,
+                                 const std::string& input_name,
+                                 const std::string& var_name) {
+    VarDesc var_desc(var_name);
+    var_desc.SetPersistable(true);
+    var_desc.SetShape({kMaxBatchSize, max_ptr_size});
+    var_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32);
+    Node* var_node = graph->CreateVarNode(&var_desc);
+    Assign(zero_maxs, scope->Var(var_name)->GetMutable<phi::DenseTensor>());
+    op_node->Op()->SetInput(input_name, {var_name});
+    IR_NODE_LINK_TO(var_node, op_node);
+  };
+
+  for (Node* op_node : target_ops) {
+    auto* op_desc = op_node->Op();
+    const auto& qkv_names = op_desc->Input("qkv");
+    PADDLE_ENFORCE_EQ(
+        qkv_names.empty(),
+        false,
+        platform::errors::InvalidArgument(
+            "The qkv input of block_multihead_attention must not be empty."));
+    // The new variable names are derived from the first qkv input name.
+    const std::string base_name = qkv_names[0];
+    op_desc->SetType("block_multihead_attention_xpu");
+    add_zero_maxs_input(
+        op_node, "cache_k_per_batch_maxs", base_name + "_max_cache_k");
+    add_zero_maxs_input(
+        op_node, "cache_v_per_batch_maxs", base_name + "_max_cache_v");
+  }
+}
+
+}  // namespace paddle::framework::ir
+
+REGISTER_PASS(block_multihead_attention_xpu_pass,
+              paddle::framework::ir::BlockMultiHeadAttentionXPUPass);
+
+REGISTER_PASS_CAPABILITY(block_multihead_attention_xpu_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "block_multihead_attention_xpu", 0));
diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
index 7e2ba4dcabee2..9399bf743d6e3 100644
--- a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
@@ -22,13 +22,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
-namespace ir {
+namespace paddle::framework::ir {
 
 class Node;
 
-namespace patterns {
+}  // namespace paddle::framework::ir
+namespace paddle::framework::ir::patterns {
 struct YoloBoxPattern : public PatternBase {
   YoloBoxPattern(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, name_scope) {
@@ -147,7 +146,8 @@ struct YoloBoxPattern : public PatternBase {
   PATTERN_DECL_NODE(nms_out_index);
   PATTERN_DECL_NODE(nms_out_rois_num);
 };
-}  // namespace patterns
+}  // namespace paddle::framework::ir::patterns
+namespace paddle::framework::ir {
 
 YoloBoxFusePass::YoloBoxFusePass() = default;
 
@@ -300,8 +300,6 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const {
   AddStatis(found_subgraph_count);
 }
 
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::ir
 
 REGISTER_PASS(yolo_box_fuse_pass, paddle::framework::ir::YoloBoxFusePass);
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 6b257385c9b06..8eba8bb026430 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/version.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 std::string LoDToString(const LoD &lod) {
   std::ostringstream stream;
@@ -520,5 +519,4 @@ void MergeLoDTensor(phi::DenseTensor *target,
   }
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 095e0401fcad5..d0da7e7e1817d 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -33,8 +33,7 @@
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
 #endif
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 void NaiveExecutor::Prepare(Scope *scope,
                             const ProgramDesc &program_desc,
                             int block_id) {
@@ -331,5 +330,4 @@ void NaiveExecutor::ResetTrtOps(int num) {
 #endif
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc
index be1f8bbfcf8a6..4b830e7b05e55 100644
--- a/paddle/fluid/framework/new_executor/executor_statistics.cc
+++ b/paddle/fluid/framework/new_executor/executor_statistics.cc
@@ -35,8 +35,7 @@ PADDLE_DEFINE_EXPORTED_string(static_executor_perfstat_filepath,
                               "enables performance statistics for the static "
                               "graph executor.");
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 class StatisticsEngine {
  public:
@@ -632,5 +631,4 @@ void StaticGraphExecutorPerfStatistics(
   }
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc
index f82350ec6d103..3d328372514d7 100644
--- a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc
+++ b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc
@@ -19,8 +19,7 @@
 #include "paddle/fluid/framework/new_executor/feed_fetch_utils.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void SetColAttrForFeedFetchOps(std::shared_ptr<ProgramDesc> program_desc,
                                const int64_t micro_batch_num,
@@ -253,5 +252,4 @@ void MergeTensors(const std::vector<const phi::DenseTensor*>& tensors,
   }
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
index a3dd897ff52c6..17bfc8bc30853 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
@@ -21,8 +21,7 @@
 #include <windows.h>
 #endif  // !_WIN32
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector(
     const std::vector<Instruction>& vec_instruction)
@@ -214,5 +213,4 @@ void InterpreterCoreEventGarbageCollector::FreeGarbages() {
   events_.clear();
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc
index 0da7138d24b9d..3c4eb57b6bee0 100644
--- a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc
@@ -22,8 +22,7 @@
 #include "paddle/pir/include/core/operation.h"
 #include "paddle/pir/include/core/value.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void CustomKernelInstruction::BuildCustomContext(
     const paddle::dialect::OpYamlInfoParser& op_yaml_info) {
@@ -509,5 +508,4 @@ void CustomKernelInstruction::Run() {
   VLOG(6) << "Run custom op " << custom_op_name_ << " kernel.";
   kernel_func_(&custom_kernel_ctx_);
 }
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
index 609fd78106747..e7a05d75f6e99 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
@@ -42,8 +42,7 @@
 COMMON_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 std::vector<int> GetValueIds(pir::Value value,
                              const ValueExecutionInfo& value_exec_info) {
@@ -407,5 +406,4 @@ bool GetCondData(const phi::DenseTensor& cond) {
   return cpu_cond->data<bool>()[0];
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
index 56ebe4673caa1..2f723c8ed686a 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -29,8 +29,7 @@
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/core/type_defs.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 LegacyKernelInstruction::LegacyKernelInstruction(
     size_t id,
@@ -189,5 +188,4 @@ void LegacyKernelInstruction::Run() {
   VLOG(6) << "Run op " << legacy_op_name_ << " kernel.";
   (*(phi_kernel_))((kernel_context_));
 }
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc
index 9af41b9e8c08b..b8a56321b9e66 100644
--- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc
@@ -405,9 +405,12 @@ OneDNNPhiKernelInstruction::~OneDNNPhiKernelInstruction() {
 }
 
 void OneDNNPhiKernelInstruction::Run() {
+  std::vector<std::shared_ptr<phi::DenseTensor>> tmp_holders;
+  auto tmp_kernel_context = kernel_context_;
+  auto tmp_infer_meta_context_ = infer_meta_context_;
   // Step1. TransLayout
-  auto inputs = kernel_context_.InputsBetween<phi::DenseTensor>(
-      size_t(0), kernel_context_.InputsSize());
+  auto inputs = tmp_kernel_context.InputsBetween<phi::DenseTensor>(
+      size_t(0), tmp_kernel_context.InputsSize());
   for (size_t i = 0; i < inputs.size(); ++i) {
     auto input = inputs[i];
     if (input == nullptr) {
@@ -419,10 +422,12 @@ void OneDNNPhiKernelInstruction::Run() {
     if (skip_format_tensors_.count(i)) {
       continue;
     }
-    VLOG(6) << "input[" << i << "].layout() = " << input->layout();
+    VLOG(6) << "input[" << i << "].layout() = " << input->layout()
+            << ", shape = " << input->dims();
     if (input->layout() != phi::DataLayout::ONEDNN) {
       phi::DataLayout from_layout = input->layout();
-      auto transed_tensor = const_cast<phi::DenseTensor*>(input);
+      tmp_holders.emplace_back(std::make_shared<phi::DenseTensor>(*input));
+      auto transed_tensor = tmp_holders.back().get();
 
       std::set<std::string> elementwise_kernels = {
           "add", "subtract", "multiply", "divide"};
@@ -461,8 +466,24 @@ void OneDNNPhiKernelInstruction::Run() {
       }
 
       dnnl::memory::desc out_mem_desc =
-          phi::funcs::make_memory_desc(*input, from_layout);
+          phi::funcs::make_memory_desc(*transed_tensor, from_layout);
       transed_tensor->set_mem_desc(out_mem_desc);
+      tmp_kernel_context.UpdataInput(i, transed_tensor);
+      auto meta_tensor = phi::MetaTensor(transed_tensor);
+      auto input_meta_tensor = phi::MetaTensor(input);
+      if (tmp_infer_meta_context_.InputsSize() > i &&
+          tmp_infer_meta_context_.InputAt(i).is_same_tensor(
+              input_meta_tensor)) {
+        tmp_infer_meta_context_.UpdataInput(i, meta_tensor);
+      } else {
+        for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) {
+          if (tmp_infer_meta_context_.InputAt(j).is_same_tensor(
+                  input_meta_tensor)) {
+            tmp_infer_meta_context_.UpdataInput(j, meta_tensor);
+            break;
+          }
+        }
+      }
     }
   }
 
@@ -470,7 +491,7 @@ void OneDNNPhiKernelInstruction::Run() {
   // SetDnnAttrIntoDeviceContext
   // SetInputsName SetOutputsName
   auto one_dnn_ctx = const_cast<phi::OneDNNContext*>(
-      &kernel_context_.GetDeviceContext<phi::OneDNNContext>());
+      &tmp_kernel_context.GetDeviceContext<phi::OneDNNContext>());
   for (auto& attr : extra_attr_) {
     one_dnn_ctx->SetDnnAttr(attr.first, attr.second);
   }
@@ -482,12 +503,12 @@ void OneDNNPhiKernelInstruction::Run() {
 
   // Step3. InferMeta
   if (infer_meta_interface_) {
-    infer_meta_interface_->infer_meta_(&(infer_meta_context_));
+    infer_meta_interface_->infer_meta_(&(tmp_infer_meta_context_));
   }
 
   // Step4. Run kernel
   VLOG(6) << "Run op " << phi_op_name_ << " infer meta.";
-  (*(phi_kernel_))(&(kernel_context_));
+  (*(phi_kernel_))(&(tmp_kernel_context));
   VLOG(6) << "Run op " << phi_op_name_ << " kernel.";
 
   // Step5. ClearDnnAttr
diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc
index 0115f2f4b9f31..3f72973e37a3e 100644
--- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc
@@ -58,6 +58,7 @@ OneDNNMixedPhiKernelInstruction::OneDNNMixedPhiKernelInstruction(
 }
 
 void OneDNNMixedPhiKernelInstruction::Run() {
+  std::vector<std::shared_ptr<phi::DenseTensor>> tmp_holders;
   // Step1. Mixed Dynamic Choose Kernel
   if (!has_choose_kernel_) {
     has_choose_kernel_ = true;
@@ -76,9 +77,11 @@ void OneDNNMixedPhiKernelInstruction::Run() {
   if (use_onednn_kernel_) {
     OneDNNPhiKernelInstruction::Run();
   } else {
+    auto tmp_kernel_context = kernel_context_;
+    auto tmp_infer_meta_context_ = infer_meta_context_;
     // TransLayout first
-    auto inputs = kernel_context_.InputsBetween<phi::DenseTensor>(
-        size_t(0), kernel_context_.InputsSize());
+    auto inputs = tmp_kernel_context.InputsBetween<phi::DenseTensor>(
+        size_t(0), tmp_kernel_context.InputsSize());
 
     for (size_t i = 0; i < inputs.size(); ++i) {
       auto input = inputs[i];
@@ -89,30 +92,66 @@ void OneDNNMixedPhiKernelInstruction::Run() {
         // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in
         // data_transfer.cc
         if (!input->IsInitialized() && tmp_layout == DataLayout::NHWC) {
-          auto transed_tensor = const_cast<phi::DenseTensor*>(input);
+          tmp_holders.emplace_back(std::make_shared<phi::DenseTensor>(*input));
+          auto transed_tensor = tmp_holders.back().get();
           transed_tensor->set_layout(tmp_layout);
           phi::funcs::MatchShapeToLayout(
               transed_tensor, phi::DataLayout::ONEDNN, tmp_layout);
+          dnnl::memory::desc out_mem_desc =
+              phi::funcs::make_memory_desc(*transed_tensor, tmp_layout);
+          transed_tensor->set_mem_desc(out_mem_desc);
+          tmp_kernel_context.UpdataInput(i, transed_tensor);
+          auto meta_tensor = phi::MetaTensor(transed_tensor);
+          auto input_meta_tensor = phi::MetaTensor(input);
+          if (tmp_infer_meta_context_.InputsSize() > i &&
+              tmp_infer_meta_context_.InputAt(i).is_same_tensor(
+                  input_meta_tensor)) {
+            tmp_infer_meta_context_.UpdataInput(i, meta_tensor);
+          } else {
+            for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) {
+              if (tmp_infer_meta_context_.InputAt(j).is_same_tensor(
+                      input_meta_tensor)) {
+                tmp_infer_meta_context_.UpdataInput(j, meta_tensor);
+                break;
+              }
+            }
+          }
         } else {
-          phi::DenseTensor transed_tensor;
-          transed_tensor.set_meta(input->meta());
+          tmp_holders.emplace_back(std::make_shared<phi::DenseTensor>());
+          auto transed_tensor = tmp_holders.back().get();
+          transed_tensor->set_meta(input->meta());
           phi::funcs::TransDataLayoutFromOneDNN(phi::DataLayout::ONEDNN,
                                                 tmp_layout,
                                                 *input,
-                                                &transed_tensor,
+                                                transed_tensor,
                                                 phi::CPUPlace());
-          *(const_cast<phi::DenseTensor*>(input)) = transed_tensor;
+          tmp_kernel_context.UpdataInput(i, transed_tensor);
+          auto meta_tensor = phi::MetaTensor(transed_tensor);
+          auto input_meta_tensor = phi::MetaTensor(input);
+          if (tmp_infer_meta_context_.InputsSize() > i &&
+              tmp_infer_meta_context_.InputAt(i).is_same_tensor(
+                  input_meta_tensor)) {
+            tmp_infer_meta_context_.UpdataInput(i, meta_tensor);
+          } else {
+            for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) {
+              if (tmp_infer_meta_context_.InputAt(j).is_same_tensor(
+                      input_meta_tensor)) {
+                tmp_infer_meta_context_.UpdataInput(j, meta_tensor);
+                break;
+              }
+            }
+          }
         }
       }
     }
 
     VLOG(6) << "Begin run op " << phi_op_name_ << " infer meta.";
     if (infer_meta_interface_) {
-      infer_meta_interface_->infer_meta_(&(infer_meta_context_));
+      infer_meta_interface_->infer_meta_(&(tmp_infer_meta_context_));
     }
     VLOG(6) << "End run op " << phi_op_name_ << " infer meta.";
     VLOG(6) << "Begin run op " << phi_op_name_ << " kernel.";
-    (*(phi_kernel_))(&(kernel_context_));
+    (*(phi_kernel_))(&(tmp_kernel_context));
     VLOG(6) << "End run op " << phi_op_name_ << " kernel.";
   }
 }
diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
index 11f29ba5c5a45..909dfefcfde08 100644
--- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
@@ -45,9 +45,7 @@ PADDLE_DEFINE_EXPORTED_bool(enable_dependency_builder_debug_info,
                             false,
                             "Enable dependency builder debug info");
 
-namespace paddle {
-namespace framework {
-namespace interpreter {
+namespace paddle::framework::interpreter {
 
 size_t CountDownstreamMap(
     const std::map<size_t, std::set<size_t>>& downstream_map) {
@@ -1482,6 +1480,4 @@ void DependencyBuilderSimplify::AddDownstreamOp(size_t prior_op_idx,
   }
 }
 
-}  // namespace interpreter
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::interpreter
diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc
index e8bcfbc736a9e..2b6a3918ba239 100644
--- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc
@@ -24,9 +24,7 @@
 
 PD_DECLARE_bool(new_executor_serial_run);
 
-namespace paddle {
-namespace framework {
-namespace interpreter {
+namespace paddle::framework::interpreter {
 
 static constexpr size_t kHostNumThreads = 4;
 static constexpr size_t kDeviceNumThreads = 1;
@@ -151,6 +149,4 @@ void ExecutionConfig::Log(int log_level) {
   VLOG(log_level) << log_str.str();
 }
 
-}  // namespace interpreter
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::interpreter
diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
index 52516d69794c8..a3c445cac3c2a 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -53,9 +53,7 @@ COMMON_DECLARE_bool(check_nan_inf);
 COMMON_DECLARE_string(static_runtime_data_save_path);
 COMMON_DECLARE_bool(save_static_runtime_data);
 
-namespace paddle {
-namespace framework {
-namespace interpreter {
+namespace paddle::framework::interpreter {
 
 using VariableIdMap = std::map<std::string, std::vector<int>>;
 
@@ -1464,6 +1462,4 @@ const std::vector<std::string> GetInstructionCallStack(
   }
   return vec_str;
 }
-}  // namespace interpreter
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::interpreter
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
index 54ee746726e7e..ee28442be2f56 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
@@ -28,9 +28,7 @@
 COMMON_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
-namespace paddle {
-namespace framework {
-namespace interpreter {
+namespace paddle::framework::interpreter {
 
 using DeviceContext = platform::DeviceContext;
 using DeviceEvent = platform::DeviceEvent;
@@ -431,12 +429,24 @@ void analyse_event_info_for_two_instructions<Instruction>(
 
   if (has_data_dependency<Instruction, std::string>(
           instructions[cur_instr_id], instructions[next_instr_id]) ||
-      !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() ||
       instructions[next_instr_id]->OpBase()->Type() == "depend") {
     waiter_instr_ids->insert(next_instr_id);
     return;
   }
 
+  if (!run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty()) {
+    auto& next_next_instructor_ids =
+        run_type_info[next_instr_id][DownstreamRunType::kEventRun];
+    for (auto& id : next_next_instructor_ids) {
+      if (has_data_dependency<Instruction, std::string>(
+              instructions[cur_instr_id], instructions[id])) {
+        waiter_instr_ids->insert(next_instr_id);
+        return;
+      }
+    }
+    return;
+  }
+
   // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and
   // simultaneously next_instr has no event_run downstream instr, we try to
   // recursively add events between cur_instr and next_instr's
@@ -491,12 +501,25 @@ void analyse_event_info_for_two_instructions<
 
   if (has_data_dependency<paddle::framework::InstructionBase, pir::Value>(
           instructions[cur_instr_id], instructions[next_instr_id]) ||
-      !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() ||
       instructions[next_instr_id]->Name() == "pd_op.depend") {
     waiter_instr_ids->insert(next_instr_id);
     return;
   }
 
+  if (!run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty()) {
+    auto& next_next_instructor_ids =
+        run_type_info[next_instr_id][DownstreamRunType::kEventRun];
+    for (auto& id : next_next_instructor_ids) {
+      if (has_data_dependency<paddle::framework::InstructionBase, pir::Value>(
+              instructions[cur_instr_id], instructions[id])) {
+        waiter_instr_ids->insert(next_instr_id);
+        return;
+      }
+    }
+
+    return;
+  }
+
   // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and
   // simultaneously next_instr has no event_run downstream instr, we try to
   // recursively add events between cur_instr and next_instr's
@@ -847,6 +870,4 @@ PirStreamAnalyzer::GetEventInfo() const {
   return event_info_;
 }
 
-}  // namespace interpreter
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::interpreter
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 416d46c01e1f2..d5fe408d53401 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -35,8 +35,7 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             "Use local_scope in new executor(especially used "
                             "in UT), can turn off for better performance");
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 InterpreterCore::InterpreterCore(const platform::Place& place,
                                  const BlockDesc& block,
@@ -170,5 +169,4 @@ Variable* InterpreterCore::DebugVar(const std::string& name) const {
   return impl_->DebugVar(name);
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc
index 183abe646a293..6cdbb6834d6d8 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.cc
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -23,8 +23,7 @@
 #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 VariableScope::VariableScope(Scope* scope)
     : var_list_(), name2id_(), vec_meta_info_(), data_transfer_added_vars_() {
@@ -353,5 +352,4 @@ void Instruction::UpdateRecordStreamForGcInfo() {
 }
 #endif
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc
index ddce703dab665..59bf34700359b 100644
--- a/paddle/fluid/framework/new_executor/pir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -91,8 +91,7 @@ COMMON_DECLARE_int32(low_precision_op_list);
   vec_instruction_base_.emplace_back(std::make_unique<instr_name>( \
       op_idx++, place_, &op, value_exe_info_.get()));
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void RecordLowPrecisionOp(const InstructionBase* instr_node) {
   if (FLAGS_low_precision_op_list) {
@@ -2024,5 +2023,4 @@ void PirInterpreter::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
       "SetCopyProgram is not implemented in PirInterpreter."));
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 4e4b41579f4fe..1acec52134046 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -13,14 +13,14 @@
 // limitations under the License.
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
 #include "paddle/common/flags.h"
+#include "paddle/fluid/framework/feed_hook.h"
 #include "paddle/fluid/framework/new_executor/feed_fetch_utils.h"
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/pir_interpreter.h"
 #include "paddle/fluid/framework/new_executor/program_interpreter.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
-#include "paddle/fluid/platform/profiler/event_tracing.h"
-
 #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 #include "paddle/fluid/ir_adaptor/translator/translate.h"
 #include "paddle/fluid/pir/transforms/general/inplace_pass.h"
@@ -32,8 +32,7 @@ COMMON_DECLARE_bool(enable_pir_in_executor);
 COMMON_DECLARE_bool(enable_pir_api);
 COMMON_DECLARE_bool(pir_apply_inplace_pass);
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
                                        const interpreter::Plan& plan,
                                        Scope* scope)
@@ -66,6 +65,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     std::shared_ptr<::pir::Program> ir_program = nullptr;
     if (FLAGS_enable_pir_api || FLAGS_enable_pir_in_executor) {  // NOLINT
       ir_program = plan_.IrProgram(job_type);
+      RunFeedHooks(*ir_program, *scope);
     } else {
       // NOTE (liuchenghao): std::make_shared will duplicate ProgramDesc object,
       // maybe std::make_unique is better?
@@ -303,5 +303,4 @@ std::shared_ptr<framework::ProgramDesc> StandaloneExecutor::RunProfile(
   return copy_desc;
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc
index bdae93c001bfa..437a14ea00404 100644
--- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc
+++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc
@@ -17,8 +17,7 @@
 #include <cstdint>
 #include <cstdlib>
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void* AlignedMalloc(size_t size, size_t alignment) {
   assert(alignment >= sizeof(void*) && (alignment & (alignment - 1)) == 0);
@@ -56,5 +55,4 @@ void AlignedFree(void* mem_ptr) {
 #endif
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc
index 1204c95dedc19..ee88ad5d161e2 100644
--- a/paddle/fluid/framework/op_def_api.cc
+++ b/paddle/fluid/framework/op_def_api.cc
@@ -41,8 +41,7 @@ namespace {
 */
 #include "paddle/fluid/framework/op_def.pbtxt"  //NOLINT
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 const proto::OpDef& GetOpDef(const std::string& op_name) {
   static std::unordered_map<std::string, proto::OpDef> ops_definition;
@@ -73,5 +72,4 @@ const proto::OpDef& GetOpDef(const std::string& op_name) {
 bool HasOpDef(const std::string& op_name) {
   return op_def_map.find(op_name) != op_def_map.end();
 }
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 512cdd9b38769..e2d6ca02f9e6d 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -25,8 +25,7 @@ extern "C" {
 #include "paddle/fluid/framework/program_converter.h"
 #include "paddle/fluid/framework/version.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
   auto *b = desc_.add_blocks();
@@ -287,5 +286,4 @@ bool ProgramDesc::NeedUpdate() const {
   return need;
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/program_utils.cc b/paddle/fluid/framework/program_utils.cc
index ede7f9aa74759..8cf1c1718c122 100644
--- a/paddle/fluid/framework/program_utils.cc
+++ b/paddle/fluid/framework/program_utils.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include <google/protobuf/text_format.h>
 #include "paddle/fluid/framework/block_desc.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 template <typename Container, typename Visitor>
 inline void VisitAllElements(Container &&container,
@@ -214,5 +213,4 @@ void DumpProgramDescFile(const std::string &name, const ProgramDesc &program) {
   WriteToFile(filename.c_str(), print_str);
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 0af05cab56ac5..2c2f3bd76d0b7 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -26,8 +26,7 @@ COMMON_DECLARE_bool(eager_delete_scope);
 #define SCOPE_VARS_READER_LOCK phi::AutoRDLock auto_lock(&vars_lock_);
 #define SCOPE_VARS_WRITER_LOCK phi::AutoWRLock auto_lock(&vars_lock_);
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 Scope::Scope() : vars_(), kids_() {}
 Scope::~Scope() { DropKids(); }  // NOLINT
 
@@ -307,5 +306,4 @@ std::string GenScopeTreeDebugInfo(Scope* root) {
   return os.str();
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 19e09ab5edf8d..f5425ba13f96e 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 class TrainerDesc;
 
@@ -248,6 +247,5 @@ void SectionWorker::TrainFiles() {
   ++batch_id_;
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 #endif
diff --git a/paddle/fluid/framework/selected_rows_utils.cc b/paddle/fluid/framework/selected_rows_utils.cc
index 3f72ced811390..a4e1e91940443 100644
--- a/paddle/fluid/framework/selected_rows_utils.cc
+++ b/paddle/fluid/framework/selected_rows_utils.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/selected_rows_utils.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void SerializeToStream(std::ostream& os,
                        const phi::SelectedRows& selected_rows,
@@ -95,5 +94,4 @@ void DeserializeFromStream(std::istream& is,
       is, selected_rows->mutable_value(), dev_ctx);
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
index 3f90bf08ac713..c3a940f877f89 100644
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 std::vector<DDim> InferShapeContext::GetReaderDims(
     const std::string &name) const {
@@ -45,5 +44,4 @@ void InferShapeContext::SetReaderDims(const std::string &name,
   return this->SetRepeatedDims(arg_names[0], dims);
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc
index e701a423abd82..96aa8d04988aa 100644
--- a/paddle/fluid/framework/string_array.cc
+++ b/paddle/fluid/framework/string_array.cc
@@ -20,8 +20,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 std::wstring_convert<std::codecvt_utf8<wchar_t>> kConverter;
 
@@ -101,5 +100,4 @@ void StringMapFromStream(std::istream& is,
   }
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
index 60c2516c0047d..90d5786a12d2c 100644
--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
   thread_local auto* x = new std::unordered_map<size_t, Scope*>;
@@ -57,5 +56,4 @@ Scope* TryCreateTransferScope(const phi::KernelKey& type0,
   return new_scope;
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
index 6f0beeb0b7311..5bde58e1c327d 100644
--- a/paddle/fluid/framework/variable_helper.cc
+++ b/paddle/fluid/framework/variable_helper.cc
@@ -24,8 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/string_array.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 
 void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
@@ -86,5 +85,4 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
   }
 }
 
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc
index 0e778ca8d7184..99ed4ca01287e 100644
--- a/paddle/fluid/imperative/data_loader.cc
+++ b/paddle/fluid/imperative/data_loader.cc
@@ -26,8 +26,7 @@
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace imperative {
+namespace paddle::imperative {
 
 static std::map<int64_t, std::set<pid_t>> load_process_pids;
 
@@ -193,7 +192,6 @@ void ThrowErrorIfLoadProcessFailed() {
   }
 }
 
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
 #endif
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index a3c5b51b80b3b..383994cac6dfb 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -31,8 +31,7 @@
 #endif
 
 COMMON_DECLARE_bool(use_mkldnn);
-namespace paddle {
-namespace imperative {
+namespace paddle::imperative {
 
 using framework::Variable;
 void ThreadSafeNameSet::Insert(const std::string& name) {
@@ -639,5 +638,4 @@ std::shared_ptr<GradOpNode> CreateGradOpNode(
   return nullptr;
 }
 
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index 3ed9b97bfc362..f748948a8cf2b 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -33,14 +33,11 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Variable;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace imperative {
+namespace paddle::imperative {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 
 void NCCLParallelContext::BcastNCCLId(
@@ -232,5 +229,4 @@ void NCCLParallelContext::SynchronizeCompute() {
 
 #endif
 
-}  //  namespace imperative
-}  //  namespace paddle
+}  // namespace paddle::imperative
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index 8811d402c20dd..ec46d19390a91 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -38,8 +38,7 @@
 
 COMMON_DECLARE_bool(sort_sum_gradient);
 
-namespace paddle {
-namespace imperative {
+namespace paddle::imperative {
 
 struct HashPair {
   template <class T1, class T2>
@@ -1184,5 +1183,4 @@ void PartialGradEngine::Execute() {
   Clear();
 }
 
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 9f4f46c60cea4..2a39e664276ed 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -39,8 +39,7 @@ COMMON_DECLARE_bool(check_nan_inf);
 PD_DECLARE_bool(benchmark);
 COMMON_DECLARE_bool(run_kp_kernel);
 
-namespace paddle {
-namespace imperative {
+namespace paddle::imperative {
 
 static const phi::Kernel empty_kernel;
 static const framework::RuntimeContext empty_ctx({}, {});
@@ -752,5 +751,4 @@ void PreparedOp::Run(const NameVarMap<egr::EagerVariable>& ins,
   }
 }
 
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index abb6c491af3f1..b4e08a47b8efa 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -30,9 +30,7 @@
 #include "paddle/phi/common/data_type.h"
 #include "paddle/utils/string/pretty_log.h"
 
-namespace paddle {
-namespace inference {
-namespace analysis {
+namespace paddle::inference::analysis {
 using string::PrettyLogEndl;
 using string::Style;
 
@@ -341,6 +339,4 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
   return graph;
 }
 
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::analysis
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index ea97be8f90a60..60634f75df3ab 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -22,12 +22,12 @@
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace inference {
+namespace paddle::inference {
 
 extern void ReadBinaryFile(const std::string &filename, std::string *contents);
 
-namespace analysis {
+}  // namespace paddle::inference
+namespace paddle::inference::analysis {
 
 void IrGraphBuildPass::RunImpl(Argument *argument) {
   if (!argument->scope_valid()) {
@@ -130,6 +130,4 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
 
 std::string IrGraphBuildPass::repr() const { return "ir_graph_build_pass"; }
 
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::analysis
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
index 2f7f61406b384..5399cf631f1df 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
@@ -18,9 +18,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-namespace paddle {
-namespace inference {
-namespace analysis {
+namespace paddle::inference::analysis {
 
 void IrGraphToProgramPass::RunImpl(Argument *argument) {
   auto cache_pass =
@@ -44,6 +42,4 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
       new framework::proto::ProgramDesc(*desc.Proto()));
 }
 
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::analysis
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 2e722f9a7e6e9..cc512a234602b 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -32,9 +32,7 @@ PD_DEFINE_bool(  // NOLINT
     false,
     "Keep old mode for developers, the model is saved on cpu not device.");
 
-namespace paddle {
-namespace inference {
-namespace analysis {
+namespace paddle::inference::analysis {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
@@ -236,6 +234,4 @@ std::string IrParamsSyncAmongDevicesPass::repr() const {
   return "ir_params_sync_among_devices_pass";
 }
 
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::analysis
diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc
index aaf9439d2b9ed..e8b8c27a24e58 100644
--- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc
@@ -20,9 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/scope.h"
 
-namespace paddle {
-namespace inference {
-namespace analysis {
+namespace paddle::inference::analysis {
 
 void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) {
   std::string model_opt_cache_dir = argument->optimized_model_save_path();
@@ -137,6 +135,4 @@ std::string SaveOptimizedModelPass::repr() const {
   return "save_optimized_model_pass";
 }
 
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::analysis
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index f47a9d166bf2d..cefe3d74fec00 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -37,6 +37,10 @@
 COMMON_DECLARE_uint64(initial_gpu_memory_in_mb);
 #endif
 
+#ifdef PADDLE_WITH_CINN
+COMMON_DECLARE_bool(use_cinn);
+#endif
+
 namespace paddle {
 struct MkldnnQuantizerConfig;
 
@@ -1552,7 +1556,13 @@ void AnalysisConfig::EnableCINN() {
 #endif
 }
 
-bool AnalysisConfig::cinn_enabled() const { return use_cinn_; }
+bool AnalysisConfig::cinn_enabled() const {  // use_cinn_ OR FLAGS_use_cinn
+  bool is_enabled = use_cinn_;
+#ifdef PADDLE_WITH_CINN
+  is_enabled = is_enabled || FLAGS_use_cinn;  // only in CINN builds
+#endif
+  return is_enabled;
+}
 
 void AnalysisConfig::EnableCustomPasses(const std::vector<std::string> &passes,
                                         bool custom_pass_only) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index adb7021633b8e..7a211edc2a699 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -28,6 +28,7 @@
 #include "paddle/fluid//platform/device/gpu/gpu_types.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/feed_hook.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/naive_executor.h"
@@ -1444,7 +1445,9 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     LOG(ERROR) << "fail to set feed";
     return false;
   }
-
+  if (config_.new_ir_enabled()) {
+    ::paddle::framework::RunFeedHooks(*pir_program_, *scope);
+  }
 #ifdef PADDLE_WITH_TENSORRT
   if (config_.tensorrt_engine_enabled()) {
     inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
@@ -1519,7 +1522,9 @@ bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs,
     LOG(ERROR) << "fail to set feed";
     return false;
   }
-
+  if (config_.new_ir_enabled()) {
+    ::paddle::framework::RunFeedHooks(*pir_program_, *scope);
+  }
 #ifdef PADDLE_WITH_TENSORRT
   if (config_.tensorrt_engine_enabled()) {
     inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 5f9f8a5284e6e..b042f27ac9845 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -212,6 +212,12 @@ if(NOT WIN32)
     set(DEPS
         ${DEPS} ${PADDLE_LIB}/paddle/lib/libphi${CMAKE_SHARED_LIBRARY_SUFFIX}
         ${PADDLE_LIB}/paddle/lib/libcommon${CMAKE_SHARED_LIBRARY_SUFFIX})
+    if(WITH_GPU OR WITH_ROCM)  # also add the phi GPU kernel library to DEPS
+      set(DEPS
+          ${DEPS}
+          ${PADDLE_LIB}/paddle/lib/libphi_kernel_gpu${CMAKE_SHARED_LIBRARY_SUFFIX}
+      )
+    endif()
   endif()
 else()
   set(DEPS
diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh
index c265721db5775..8901cd16b5e1d 100755
--- a/paddle/fluid/inference/api/demo_ci/clean.sh
+++ b/paddle/fluid/inference/api/demo_ci/clean.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index e1369ca51c5d0..d72b6bfadf6bf 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -184,7 +184,7 @@ for WITH_STATIC_LIB in ON OFF; do
         fi
       done
     done
-    
+
     # --------tensorrt mobilenet on windows------
     if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
       rm -rf *
@@ -203,7 +203,7 @@ for WITH_STATIC_LIB in ON OFF; do
       ./trt_mobilenet_demo.exe \
         --modeldir=$DATA_DIR/mobilenet/model \
         --data=$DATA_DIR/mobilenet/data.txt \
-        --refer=$DATA_DIR/mobilenet/result.txt 
+        --refer=$DATA_DIR/mobilenet/result.txt
       if [ $? -ne 0 ]; then
         echo "trt_mobilenet_demo runs failed." >> ${current_dir}/test_summary.txt
         EXIT_CODE=1
@@ -268,7 +268,7 @@ for WITH_STATIC_LIB in ON OFF; do
       ./trt_mobilenet_demo \
         --modeldir=$DATA_DIR/mobilenet/model \
         --data=$DATA_DIR/mobilenet/data.txt \
-        --refer=$DATA_DIR/mobilenet/result.txt 
+        --refer=$DATA_DIR/mobilenet/result.txt
       if [ $? -ne 0 ]; then
         echo "trt_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt
         EXIT_CODE=1
diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
index 6eb932a190654..4bb859becf70c 100644
--- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
+++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
@@ -65,12 +65,12 @@ if /i "%use_gpu%"=="Y" (
   set use_gpu=N
 )
 
-rem set_path_vs_command_prompt 
+rem set_path_vs_command_prompt
 :set_vcvarsall_dir
 SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat   =======>"
 set tmp_var=!vcvarsall_dir!
 call:remove_space
-set vcvarsall_dir=!tmp_var!   
+set vcvarsall_dir=!tmp_var!
 IF NOT EXIST "%vcvarsall_dir%" (
     echo "------------%vcvarsall_dir% not exist------------"
     goto set_vcvarsall_dir
@@ -104,18 +104,18 @@ if EXIST "%source_path%\%model_name%.tar.gz" (
     SET /P python_path="Please input the path of python.exe, such as C:\Python37\python.exe =======>"
     set tmp_var=!python_path!
     call:remove_space
-    set python_path=!tmp_var!   
+    set python_path=!tmp_var!
     if "!python_path!"=="" (
       set python_path=python.exe
     ) else (
       if NOT exist "!python_path!" (
-        echo "------------!python_path! not exist------------" 
+        echo "------------!python_path! not exist------------"
         goto:eof
-      )  
+      )
     )
     md %source_path%\%model_name%
     !python_path! %source_path%\untar_model.py %source_path%\%model_name%.tar.gz %source_path%\%model_name%
-    
+
     SET error_code=N
     if "%model_name%"=="mobilenet" (
       if NOT EXIST "%source_path%\%model_name%\model" set error_code=Y
@@ -127,7 +127,7 @@ if EXIST "%source_path%\%model_name%.tar.gz" (
        del /f /s /q "%source_path%\%model_name%\*.*" >nul 2>&1
        rd /s /q  "%source_path%\%model_name%" >nul 2>&1
        goto:eof
-    )  
+    )
   )
 )
 
@@ -201,7 +201,7 @@ if /i "%use_gpu%"=="Y" (
 )
 
 if exist "%build_path%\Release\%demo_name%.exe" (
-  cd %build_path%\Release 
+  cd %build_path%\Release
   set GLOG_v=4
   if "%demo_name%"=="simple_on_word2vec" (
       %demo_name%.exe --dirname="%source_path%\%model_name%\%model_name%" --use_gpu="%use_gpu%"
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 57f8066df1eeb..d8206093efa53 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -60,11 +60,6 @@ void Tensor::Reshape(const std::vector<int> &shape) {
           "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<phi::DenseTensor>();
   tensor->Resize(common::make_ddim(shape));
-#ifdef PADDLE_WITH_DNNL
-  if (tensor->layout() == phi::DataLayout::ONEDNN) {
-    tensor->set_layout(phi::DataLayout::ANY);
-  }
-#endif
 }
 
 void Tensor::ReshapeStrings(const size_t &shape) {
@@ -212,11 +207,6 @@ void Tensor::CopyFromCpu(const T *data) {
   if (place_ == PlaceType::kCPU) {
     auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
     std::memcpy(static_cast<void *>(t_data), data, ele_size);
-#ifdef PADDLE_WITH_DNNL
-    if (tensor->layout() == phi::DataLayout::ONEDNN) {
-      tensor->set_layout(phi::DataLayout::ANY);
-    }
-#endif
   } else if (place_ == PlaceType::kGPU) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index 416a62e980fe5..9823c45195361 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -34,8 +34,7 @@
 #include "paddle/pir/include/core/operation.h"
 #include "paddle/pir/include/core/value.h"
 
-namespace paddle {
-namespace inference {
+namespace paddle::inference {
 
 template <>
 std::string to_string<std::vector<float>>(
@@ -446,5 +445,4 @@ void InitGflagsFromEnv() {
   framework::InitGflags(gflags);
 }
 
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference
diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc
index 8d5e1e6ce1cae..00da2279917e2 100644
--- a/paddle/fluid/inference/api/paddle_infer_contrib.cc
+++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -20,8 +20,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 
-namespace paddle_infer {
-namespace contrib {
+namespace paddle_infer::contrib {
 
 using paddle::PaddleDType;
 
@@ -290,5 +289,4 @@ bool Status::operator!=(const Status& x) const noexcept {
   return !(*this == x);
 }
 
-}  // namespace contrib
-}  // namespace paddle_infer
+}  // namespace paddle_infer::contrib
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index a296074f9d6cf..905144110386f 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -538,6 +538,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "group_norm_silu_xpu_fuse_pass",
       "embedding_with_eltwise_add_xpu_fuse_pass",
       "qk_qkv_attention_xpu_fuse_pass",
+      "block_multihead_attention_xpu_pass",
       "multi_encoder_xpu_fuse_pass",
       "multi_encoder_xpu_adaptive_seqlen_fuse_pass",
       "multi_encoder_xpu_slice_fuse_pass",
@@ -613,11 +614,14 @@ const std::vector<std::string> kPirGpuPasses{
     "fused_weight_only_linear_pass",
     "matmul_add_act_fuse_pass",
     "fc_elementwise_layernorm_fuse_pass",
+    "add_norm_fuse_pass",
+    "group_norm_silu_fuse_pass",
     "matmul_scale_fuse_pass",
     "matmul_transpose_fuse_pass",
     "transpose_flatten_concat_fuse_pass",
     "remove_redundant_transpose_pass",
-    "transfer_layout_pass"};
+    "transfer_layout_pass",
+};
 
 const std::vector<std::string> kPirXpuPasses{// Functional pass
                                              "map_op_to_another_pass",
@@ -625,7 +629,7 @@ const std::vector<std::string> kPirXpuPasses{// Functional pass
                                              // Operator fusion pass
                                              "add_layernorm_xpu_fuse_pass",
                                              "conv2d_bn_xpu_fuse_pass",
-                                             "group_norm_silu_xpu_fuse_pass"};
+                                             "group_norm_silu_fuse_pass"};
 
 const std::vector<std::string> kPirMkldnnPasses {
   "depthwise_conv_onednn_pass",              //
diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
index 0d8892f20514f..ea7f66a5f729c 100755
--- a/paddle/fluid/inference/check_symbol.sh
+++ b/paddle/fluid/inference/check_symbol.sh
@@ -1,13 +1,13 @@
 #!/bin/sh
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/inference/experimental/javaapi/test.sh b/paddle/fluid/inference/experimental/javaapi/test.sh
index d664ee240375a..343c2dfa59cdd 100644
--- a/paddle/fluid/inference/experimental/javaapi/test.sh
+++ b/paddle/fluid/inference/experimental/javaapi/test.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh
index fbde661d177f7..79cf3e5a74378 100644
--- a/paddle/fluid/inference/goapi/test.sh
+++ b/paddle/fluid/inference/goapi/test.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,7 +17,7 @@
 # 1. download the mobilenetv1 model to test config and predictor
 if [ ! -d mobilenetv1 ]; then
     wget https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/mobilenetv1.tgz
-    tar xzf mobilenetv1.tgz 
+    tar xzf mobilenetv1.tgz
 fi
 
 # 2. set LD_LIBRARY_PATH
diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map
index 267dcf7fb601d..180d4e643ba23 100644
--- a/paddle/fluid/inference/paddle_inference.map
+++ b/paddle/fluid/inference/paddle_inference.map
@@ -71,7 +71,7 @@
 			/* *paddle::framework*; */
 			*paddle::framework::InitDevices*;
 			*paddle::framework::InitMemoryMethod*;
-                        
+
 			*paddle::framework::InterpreterCore*;
 			*paddle::framework::Executor*;
 			*paddle::framework::proto*;
diff --git a/paddle/fluid/inference/tensorrt/convert/bitwise_not_op.cc b/paddle/fluid/inference/tensorrt/convert/bitwise_not_op.cc
index 63a02d4e393e8..77b829228e5f0 100644
--- a/paddle/fluid/inference/tensorrt/convert/bitwise_not_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/bitwise_not_op.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include <iostream>
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class BitwiseNotConverter : public OpConverter {
  public:
@@ -73,8 +71,6 @@ class BitwiseNotConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(bitwise_not, BitwiseNotConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/bitwise_or_op.cc b/paddle/fluid/inference/tensorrt/convert/bitwise_or_op.cc
index 814ee8bd98551..9c5beb4634035 100644
--- a/paddle/fluid/inference/tensorrt/convert/bitwise_or_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/bitwise_or_op.cc
@@ -17,9 +17,7 @@
 #include <iostream>
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class BitwiseOrConverter : public OpConverter {
  public:
@@ -53,8 +51,6 @@ class BitwiseOrConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(bitwise_or, BitwiseOrConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc b/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc
index 767cf996f7d7f..e84c18a79c4b3 100644
--- a/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/c_allreduce_op.cc
@@ -16,9 +16,7 @@
 #include "paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.h"
 #include "paddle/phi/common/data_type.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 using ReduceType = paddle::inference::tensorrt::plugin::ReduceType;
 std::map<std::string, ReduceType> op_to_reduce_type = {
     {"c_allreduce_sum", paddle::inference::tensorrt::plugin::kRedSum},
@@ -88,9 +86,7 @@ class CAllReduceOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(c_allreduce_sum, CAllReduceOpConverter);
 REGISTER_TRT_OP_CONVERTER(c_allreduce_max, CAllReduceOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/celu_op.cc b/paddle/fluid/inference/tensorrt/convert/celu_op.cc
index 837364a9feca7..d2279f9610b2d 100644
--- a/paddle/fluid/inference/tensorrt/convert/celu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/celu_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class CeluOpConverter : public OpConverter {
  public:
@@ -82,8 +80,6 @@ class CeluOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(celu, CeluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index 6f4fdc30214b5..f1d66090eeb3c 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * ConcatOp
@@ -53,8 +51,6 @@ class ConcatOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc
index 37a53d31f47b5..547ec74c19fa6 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 template <typename RegistFunc, typename SetDilationFunc>
 void ConvertConv3d(TensorRTEngine* engine,
@@ -192,9 +190,7 @@ class Deconv3dOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(conv3d, Conv3dOpConverter);
 REGISTER_TRT_OP_CONVERTER(conv3d_transpose, Deconv3dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc
index 3b9cc9dd0d349..79e40a80f9531 100644
--- a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class FillConstantOpConverter : public OpConverter {
  public:
@@ -124,8 +122,6 @@ class FillConstantOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(fill_constant, FillConstantOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/flip_op.cc b/paddle/fluid/inference/tensorrt/convert/flip_op.cc
index 0ac714507b5ce..de162b7fbd9ee 100644
--- a/paddle/fluid/inference/tensorrt/convert/flip_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/flip_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class FlipOpConverter : public OpConverter {
  public:
@@ -76,8 +74,6 @@ class FlipOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(flip, FlipOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
index e9d4ae9182095..f015e809dc210 100644
--- a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
@@ -12,9 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class FusedTokenPruneOpConverter : public OpConverter {
  public:
@@ -109,8 +107,6 @@ class FusedTokenPruneOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(fused_token_prune, FusedTokenPruneOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
index 508d7a5f9b390..2a70b7b524973 100644
--- a/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class GatherNdOpConverter : public OpConverter {
  public:
@@ -63,8 +61,6 @@ class GatherNdOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc b/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc
index a2fe27590df02..adf2f10584805 100644
--- a/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/grid_sampler_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * GridSampler Op
@@ -81,8 +79,6 @@ class GridSamplerOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(grid_sampler, GridSamplerOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/index_select_op.cc b/paddle/fluid/inference/tensorrt/convert/index_select_op.cc
index 9ee875c92445e..6f869b38b1924 100644
--- a/paddle/fluid/inference/tensorrt/convert/index_select_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/index_select_op.cc
@@ -14,19 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
 
-namespace proto {
+}  // namespace paddle::framework
+namespace paddle::framework::proto {
 class OpDesc;
-}  // namespace proto
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::proto
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * Gather Op
@@ -68,8 +64,6 @@ class IndexSelectConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(index_select, IndexSelectConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
index bd97df48309c7..384183d11f51c 100644
--- a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class InstanceNormOpConverter : public OpConverter {
  public:
@@ -77,8 +75,6 @@ class InstanceNormOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(instance_norm, InstanceNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
index d3fda4cb24e28..f505c36b2ed5c 100644
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 // LeakyRelu converter from fluid to tensorRT
 class LeakyReluOpConverter : public OpConverter {
@@ -121,8 +119,6 @@ class LeakyReluOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/lookup_table_op.cc b/paddle/fluid/inference/tensorrt/convert/lookup_table_op.cc
index cdb49be72f50f..b86139b6b6476 100644
--- a/paddle/fluid/inference/tensorrt/convert/lookup_table_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/lookup_table_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class LookupTableOpConverter : public OpConverter {
  public:
@@ -72,9 +70,7 @@ class LookupTableV2OpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(lookup_table, LookupTableOpConverter);
 REGISTER_TRT_OP_CONVERTER(lookup_table_v2, LookupTableV2OpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc
index 16d6f3f20750c..fd72f8b78f9af 100644
--- a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc
@@ -12,9 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/phi/common/data_type.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * After trt_map_ops_to_matrix_multiply_pass(mul, matmul, matmul_v2 ->
@@ -266,8 +264,6 @@ class MatrixMultiplyOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(matrix_multiply, MatrixMultiplyOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc
index 107217477d14f..f2d00ab4b4667 100644
--- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc
@@ -13,9 +13,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class MultiClassNMS3OpConverter : public OpConverter {
  public:
@@ -170,8 +168,6 @@ class MultiClassNMS3OpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(multiclass_nms3, MultiClassNMS3OpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index 68f18bd6e7472..2ea04f6fcfd3d 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class MultiheadMatMulOpConverter : public OpConverter {
  public:
@@ -960,8 +958,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(multihead_matmul, MultiheadMatMulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
index c0f38cf79ff91..1e7514389e2ea 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 inline void DealCeilMode(const nvinfer1::Dims &input_shape,
                          std::vector<int> ksize,
@@ -228,9 +226,7 @@ class Pool3dOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 USE_OP_ITSELF(pool3d);
 REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
index 9f9cbe7c6bceb..4c73c5c897570 100644
--- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
  public:
@@ -237,9 +235,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(fused_preln_embedding_eltwise_layernorm,
                           PrelnEmbEltwiseLayerNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc
index 824f0ff902874..5e2c32f5c7d31 100644
--- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 using half = paddle::platform::float16;
 class PrelnResidualBiasOpConverter : public OpConverter {
@@ -105,9 +103,7 @@ class PrelnResidualBiasOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(fused_bias_dropout_residual_layer_norm,
                           PrelnResidualBiasOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc
index e6beaae910d96..6c83913f7d888 100644
--- a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter {
  public:
@@ -168,9 +166,7 @@ class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(prompt_tuning_emb_eltwise_layernorm,
                           PromptTuningEmbEltwiseLayerNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc
index aafbec6660c67..2d7798878d971 100644
--- a/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class QkMultiheadMatMulOpConverter : public OpConverter {
  public:
@@ -290,8 +288,6 @@ class QkMultiheadMatMulOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(qk_multihead_matmul, QkMultiheadMatMulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/quantize_linear_op.cc b/paddle/fluid/inference/tensorrt/convert/quantize_linear_op.cc
index 74a8f56ea6c20..0ddcee9244925 100644
--- a/paddle/fluid/inference/tensorrt/convert/quantize_linear_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/quantize_linear_op.cc
@@ -12,9 +12,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class QuantizeLinearOpConverter : public OpConverter {
  public:
@@ -60,8 +58,6 @@ class QuantizeLinearOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(quantize_linear, QuantizeLinearOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc
index 8fbdea5edd4c9..74f13234bf14d 100644
--- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * Roi Align Op
@@ -69,8 +67,6 @@ class RoiAlignOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/roll_op.cc b/paddle/fluid/inference/tensorrt/convert/roll_op.cc
index ca42b3c34c3f8..1d4d8a5b4a229 100644
--- a/paddle/fluid/inference/tensorrt/convert/roll_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/roll_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 /*
  * Stack converter from fluid to tensorRT.
  */
@@ -91,8 +89,6 @@ class RollOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(roll, RollOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/share_data_op.cc b/paddle/fluid/inference/tensorrt/convert/share_data_op.cc
index 38fa1ff6e0c83..a309d3faa10ec 100644
--- a/paddle/fluid/inference/tensorrt/convert/share_data_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/share_data_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class ShareDataOpConverter : public OpConverter {
  public:
@@ -32,8 +30,6 @@ class ShareDataOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(share_data, ShareDataOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
index 988d0d064c862..e87bf699b15fd 100644
--- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/phi/common/data_type.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class SkipLayerNormOpConverter : public OpConverter {
  public:
@@ -257,8 +255,6 @@ class SkipLayerNormOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(skip_layernorm, SkipLayerNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/skip_merge_layernorm_op.cc b/paddle/fluid/inference/tensorrt/convert/skip_merge_layernorm_op.cc
index 4bb54de495b19..a73be2eb3e3c6 100644
--- a/paddle/fluid/inference/tensorrt/convert/skip_merge_layernorm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/skip_merge_layernorm_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 class SkipMergeLayernormOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
@@ -87,8 +85,6 @@ class SkipMergeLayernormOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(skip_merge_layernorm, SkipMergeLayernormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc
index 30ffcd88472d3..1e5dcdeac5019 100644
--- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * Stack converter from fluid to tensorRT.
@@ -80,8 +78,6 @@ class StackOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(stack, StackOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/sum_op.cc b/paddle/fluid/inference/tensorrt/convert/sum_op.cc
index 900a37126f1ce..e9a1408185af2 100644
--- a/paddle/fluid/inference/tensorrt/convert/sum_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/sum_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class SumOpConverter : public OpConverter {
  public:
@@ -47,8 +45,6 @@ class SumOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(sum, SumOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 7ef6d1f3241d8..c9a69c8a7c624 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -19,9 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
@@ -68,8 +66,6 @@ TEST(OpConverter, ConvertBlock) {
       *block->Proto(), {"conv2d-Y"}, scope, engine_.get() /*TensorRTEngine*/);
 }
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 USE_TRT_CONVERTER(conv2d)
diff --git a/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc b/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc
index a5db8ed88c4c0..8251d3a3e745e 100644
--- a/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc
@@ -11,9 +11,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.h"
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class TransLayerNormOpConverter : public OpConverter {
  public:
@@ -84,8 +82,6 @@ class TransLayerNormOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(trans_layernorm, TransLayerNormOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc
index 1dca9bb818c38..f7fda67a3643f 100644
--- a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc
@@ -15,9 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * Convert Transformer Input(pos_id, max_seqlen).
@@ -58,8 +56,6 @@ class TransformerInputConvert : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(transformer_input_convert, TransformerInputConvert);
diff --git a/paddle/fluid/inference/tensorrt/convert/transpose_op.cc b/paddle/fluid/inference/tensorrt/convert/transpose_op.cc
index 62ef6edd2230b..045a991492628 100644
--- a/paddle/fluid/inference/tensorrt/convert/transpose_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/transpose_op.cc
@@ -11,9 +11,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 /*
  * TransposeOp
@@ -48,9 +46,7 @@ class TransposeOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(transpose, TransposeOpConverter);
 REGISTER_TRT_OP_CONVERTER(transpose2, TransposeOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc
index ea78ec9292159..f720515acc2eb 100644
--- a/paddle/fluid/inference/tensorrt/convert/unary_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc
@@ -23,9 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class UnaryOpConverter : public OpConverter {
  public:
@@ -216,9 +214,7 @@ class RoundOpConverter : public UnaryOpConverter {
 };
 #endif
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(exp, ExpOpConverter);
 REGISTER_TRT_OP_CONVERTER(log, LogOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/unsqueeze2_op.cc b/paddle/fluid/inference/tensorrt/convert/unsqueeze2_op.cc
index 7cdc1b07fd04d..72f19d07f5a1f 100644
--- a/paddle/fluid/inference/tensorrt/convert/unsqueeze2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/unsqueeze2_op.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class Unsqueeze2OpConverter : public OpConverter {
  public:
@@ -94,8 +92,6 @@ class Unsqueeze2OpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(unsqueeze2, Unsqueeze2OpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc
index eafb38221ecf3..c03368c3f4bcc 100644
--- a/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc
@@ -12,18 +12,14 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-namespace proto {
+}  // namespace paddle::framework
+namespace paddle::framework::proto {
 class OpDesc;
-}  // namespace proto
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework::proto
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class YoloBoxHeadOpConverter : public OpConverter {
  public:
@@ -50,8 +46,6 @@ class YoloBoxHeadOpConverter : public OpConverter {
   }
 };
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
 
 REGISTER_TRT_OP_CONVERTER(yolo_box_head, YoloBoxHeadOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc
index e6bc25af044dc..28ef055897b80 100644
--- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc
+++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc
@@ -17,9 +17,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/kernels/funcs/unfold_functor.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class ExprWrapper {
  public:
@@ -124,6 +122,7 @@ static std::vector<ExprWrapper> DimsExprs2VecExprWrapper(
     nvinfer1::IExprBuilder& expr_builder  // NOLINT
 ) {
   std::vector<ExprWrapper> x_dims_wrap;
+  x_dims_wrap.reserve(x_dims.nbDims);
   for (int i = 0; i < x_dims.nbDims; i++) {
     x_dims_wrap.emplace_back(x_dims.d[i], &expr_builder);
   }
@@ -154,6 +153,7 @@ nvinfer1::DimsExprs GatherNdInferMeta(
   std::vector<const nvinfer1::IDimensionExpr*> result_dims;
   // The result dims is
   //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
+  result_dims.reserve(index_dims_size - 1);
   for (int i = 0; i < index_dims_size - 1; ++i) {
     result_dims.emplace_back(index_dims.d[i]);
   }
@@ -899,6 +899,4 @@ PD_REGISTER_DYNAMIC_INFER_META_FN(pad, PadInferMeta);
 PD_REGISTER_DYNAMIC_INFER_META_FN(argsort, ArgsortInferMeta);
 PD_REGISTER_DYNAMIC_INFER_META_FN(scatter, ScatterInferMeta);
 PD_REGISTER_DYNAMIC_INFER_META_FN(solve, SolveInferMeta);
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
index da9784fbb6487..64e55023892c4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc
@@ -16,10 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
+namespace paddle::inference::tensorrt::plugin {
 
 TEST(split_op_plugin, test_plugin) {
   int axis = 1;
@@ -60,7 +57,4 @@ TEST(split_op_plugin, test_plugin_creater) {
   creator.setPluginNamespace("test");
 }
 
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt::plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
index 637bd84deaff0..83941eb00cf22 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
@@ -14,10 +14,7 @@
 
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-namespace plugin {
+namespace paddle::inference::tensorrt::plugin {
 
 inline void Serialize(void*& buffer,  // NOLINT
                       const std::vector<nvinfer1::Dims>& input_dims,
@@ -139,7 +136,4 @@ const char* TensorRTPluginCreator::getPluginNamespace() const TRT_NOEXCEPT {
   return plugin_namespace_.c_str();
 }
 
-}  // namespace plugin
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt::plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc
index d4631f7057582..28161758be07f 100644
--- a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc
+++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc
@@ -14,9 +14,7 @@
 
 #include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 bool PluginArgumentMappingContext::HasInput(const std::string& name) const {
   auto inputs = op_desc_->Inputs();
@@ -160,6 +158,4 @@ bool PluginArgumentMappingContext::IsForInferShape() const {
   return false;
 }
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
diff --git a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc
index 85dddfea2a7c7..9bcf06cdd978c 100644
--- a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc
+++ b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 TEST(ArgMappingContextTest, BasicFunction) {
   paddle::framework::proto::OpDesc op;
@@ -123,6 +121,4 @@ TEST(ArgMappingContextTest, BasicFunction) {
   EXPECT_EQ(context.IsDenseTensorOutput("Out"), true);
 }
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
index d87c9af8cfa67..ae12901e7da90 100644
--- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
@@ -29,9 +29,7 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 
 using float16 = phi::dtype::float16;
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class TensorRTDynamicShapeValueEngineTest : public ::testing::Test {
  public:
@@ -1049,6 +1047,4 @@ TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) {
 }
 */
 #endif
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 4c08da6d060eb..1f0b81da76ca3 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -20,9 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace inference {
-namespace tensorrt {
+namespace paddle::inference::tensorrt {
 
 class TensorRTEngineTest : public ::testing::Test {
  protected:
@@ -334,6 +332,4 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   ASSERT_EQ(y_cpu[1], 5.0);
 }
 
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference::tensorrt
diff --git a/paddle/fluid/inference/utils/model_utils.cc b/paddle/fluid/inference/utils/model_utils.cc
index 27bc8b35306e1..0397793aeecfc 100644
--- a/paddle/fluid/inference/utils/model_utils.cc
+++ b/paddle/fluid/inference/utils/model_utils.cc
@@ -18,8 +18,7 @@
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/phi/common/data_type.h"
 
-namespace paddle {
-namespace inference {
+namespace paddle::inference {
 
 using paddle::framework::proto::VarType;
 
@@ -70,5 +69,4 @@ phi::DataType GetModelPrecision(const framework::ProgramDesc& program) {
   return ret;
 }
 
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference
diff --git a/paddle/fluid/inference/utils/table_printer.cc b/paddle/fluid/inference/utils/table_printer.cc
index a9b6633217ad0..d182759278d10 100644
--- a/paddle/fluid/inference/utils/table_printer.cc
+++ b/paddle/fluid/inference/utils/table_printer.cc
@@ -30,8 +30,7 @@
 #include <string>
 #include <vector>
 
-namespace paddle {
-namespace inference {
+namespace paddle::inference {
 
 std::string TablePrinter::PrintTable() {
   std::stringstream ss;
@@ -211,5 +210,4 @@ void TablePrinter::AddRow(std::stringstream& ss, size_t row_idx) {
   }
 }
 
-}  // namespace inference
-}  // namespace paddle
+}  // namespace paddle::inference
diff --git a/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 b/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2
index e7b7812fe61be..71c38e487c909 100644
--- a/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2
+++ b/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2
@@ -2,7 +2,7 @@
 
 namespace paddle {
 namespace translator {
-    
+
 OpNameNormalizer::OpNameNormalizer() {
     op_name_mappings = {
         {% for legacy_name, normalized_name in op_name_pairs.items() %}
@@ -11,35 +11,35 @@ OpNameNormalizer::OpNameNormalizer() {
     };
     op_arg_name_mappings = {
         {% for op_name, arg_name_mappings in op_arg_name_pairs.items() %}
-        { 
-            "{{op_name}}", 
+        {
+            "{{op_name}}",
             {
                 {% for normalized_name, legacy_name in arg_name_mappings.items() %}
                 { "{{normalized_name}}", "{{legacy_name}}" },
                 {% endfor %}
-            }, 
+            },
         },
         {% endfor %}
     };
     op_mutable_attributes = {
         {% for op_name, mutable_attributes in op_mutable_attributes.items() %}
-        { 
-            "{{op_name}}", 
+        {
+            "{{op_name}}",
             {
                 {% for attribute_name in mutable_attributes %}
                 "{{attribute_name}}",
                 {% endfor %}
-            }, 
+            },
         },
         {% endfor %}
     };
     op_mutable_attribute_infos = {
         {% for op_name, mutable_attribute_infos in op_mutable_attribute_infos.items() %}
-        { 
-            "{{op_name}}", 
+        {
+            "{{op_name}}",
             {
                 {% for attribute_name, attribute_info in mutable_attribute_infos.items() %}
-                { 
+                {
                     "{{attribute_name}}",
                     {
                     {% for candidate_var_name in attribute_info  %}
@@ -48,7 +48,7 @@ OpNameNormalizer::OpNameNormalizer() {
                     },
                 },
                 {% endfor %}
-            }, 
+            },
         },
         {% endfor %}
     };
diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index 005f73e7b6427..8dbfe787ceb81 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -1045,7 +1045,7 @@ struct CastOpTranscriber : public OpTranscriber {
       const OpDesc& op_desc) override {
     auto& attribute_translator = AttributeTranslator::instance();
     pir::AttributeMap attribute_map = {};
-    const OpAttributeInfo info = op_attr_infos[0];
+    const OpAttributeInfo& info = op_attr_infos[0];
 
     std::string legacy_attr_name("out_dtype");
 
diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc
index a544f89bd3b38..a16e31e13075a 100644
--- a/paddle/fluid/ir_adaptor/translator/program_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc
@@ -41,8 +41,7 @@
 #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
 #include "paddle/pir/include/dialect/control_flow/ir/cf_type.h"
 
-namespace paddle {
-namespace translator {
+namespace paddle::translator {
 
 using ProgramDesc = ::paddle::framework::ProgramDesc;
 using BlockDesc = ::paddle::framework::BlockDesc;
@@ -824,5 +823,4 @@ ProgramTranslator::VarDesc2Value() {
   return var_desc_2_value;
 }
 
-}  // namespace translator
-}  // namespace paddle
+}  // namespace paddle::translator
diff --git a/paddle/fluid/jit/function_schema.cc b/paddle/fluid/jit/function_schema.cc
index cae24962e13e2..5fcd9527c45a0 100644
--- a/paddle/fluid/jit/function_schema.cc
+++ b/paddle/fluid/jit/function_schema.cc
@@ -18,8 +18,7 @@
 #include "paddle/phi/core/enforce.h"
 
 #include "paddle/fluid/jit/function_utils.h"
-namespace paddle {
-namespace jit {
+namespace paddle::jit {
 
 Argument::Argument(const std::string& name, bool is_out)
     : name_(name), is_output_(is_out) {}
@@ -96,5 +95,4 @@ void FunctionInfo::RemoveDescFeedFetch() {
   utils::RemoveFeedFetch(program_desc_.get());
 }
 
-}  // namespace jit
-}  // namespace paddle
+}  // namespace paddle::jit
diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc
index 519bcb2a88877..88173f8df1e3d 100644
--- a/paddle/fluid/jit/function_utils.cc
+++ b/paddle/fluid/jit/function_utils.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace jit {
-namespace utils {
+namespace paddle::jit::utils {
 
 std::vector<DenseTensor> ToDenseTensors(const std::vector<Tensor> &tensors) {
   std::vector<DenseTensor> ret;
@@ -111,6 +109,4 @@ void RemoveFeedFetch(framework::ProgramDesc *program_desc) {
   }
 }
 
-}  // namespace utils
-}  // namespace jit
-}  // namespace paddle
+}  // namespace paddle::jit::utils
diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc
index c1a493db17ea9..823649686f0b9 100644
--- a/paddle/fluid/jit/layer.cc
+++ b/paddle/fluid/jit/layer.cc
@@ -23,8 +23,7 @@
 #include "paddle/fluid/jit/function.h"
 #include "paddle/fluid/jit/function_schema.h"
 
-namespace paddle {
-namespace jit {
+namespace paddle::jit {
 
 Layer::Layer(const std::shared_ptr<VariableMap>& params_map,
              const std::shared_ptr<VariableMap>& attrs_map,
@@ -104,5 +103,4 @@ std::shared_ptr<Layer> Layer::Clone(void* stream) {
   return x;
 }
 
-}  // namespace jit
-}  // namespace paddle
+}  // namespace paddle::jit
diff --git a/paddle/fluid/jit/property.cc b/paddle/fluid/jit/property.cc
index d91aba11cfb55..ddbd24d590498 100644
--- a/paddle/fluid/jit/property.cc
+++ b/paddle/fluid/jit/property.cc
@@ -23,8 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/jit/property.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace jit {
+namespace paddle::jit {
 
 using Variable = paddle::framework::Variable;
 
@@ -378,5 +377,4 @@ std::vector<std::string> Property::GetStrings(const std::string &name) {
   return {};
 }
 
-}  // namespace jit
-}  // namespace paddle
+}  // namespace paddle::jit
diff --git a/paddle/fluid/jit/property.proto b/paddle/fluid/jit/property.proto
index 5f89e1da90b91..a00da9fc6e40a 100644
--- a/paddle/fluid/jit/property.proto
+++ b/paddle/fluid/jit/property.proto
@@ -84,7 +84,7 @@ message TensorProto {
   // For int64.
   // When this field is present, the data_type field MUST be INT64
   repeated int64 int64_data = 7 [packed = true];
-  
+
   // For double
   // Complex128 tensors are encoded as a single array of doubles,
   // with the real components appearing in odd numbered positions,
@@ -130,16 +130,16 @@ message ValueProto {
     STRINGS = 8;
     TENSORS = 9;
   }
-  optional string name = 1;           
-  
+  optional string name = 1;
+
   optional AttributeType type = 2;   // discriminator that indicates which field below is in use
-  
+
   // Exactly ONE of the following fields must be present
   optional float f = 3;               // float
   optional int64 i = 4;               // int
   optional bytes s = 5;               // UTF-8 string
   optional TensorProto t = 6;         // tensor value
-  
+
   repeated float floats = 7;          // list of floats
   repeated int64 ints = 8;            // list of ints
   repeated bytes strings = 9;         // list of UTF-8 strings
@@ -147,5 +147,5 @@ message ValueProto {
 }
 
 message PropertyVals {
-  repeated ValueProto entrys=1;  
+  repeated ValueProto entrys=1;
 }
diff --git a/paddle/fluid/jit/serializer_utils.cc b/paddle/fluid/jit/serializer_utils.cc
index 4fdc07f55ac74..4d22be839e4f1 100644
--- a/paddle/fluid/jit/serializer_utils.cc
+++ b/paddle/fluid/jit/serializer_utils.cc
@@ -20,9 +20,7 @@
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/var_desc.h"
 
-namespace paddle {
-namespace jit {
-namespace utils {
+namespace paddle::jit::utils {
 
 bool IsPersistable(framework::VarDesc* desc_ptr) {
   auto type = desc_ptr->GetType();
@@ -109,6 +107,4 @@ void InitKernelSignatureMap() {
   paddle::framework::InitDefaultKernelSignatureMap();
 }
 
-}  // namespace utils
-}  // namespace jit
-}  // namespace paddle
+}  // namespace paddle::jit::utils
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index c96730f5fda50..eef6c1a1e8c4a 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -117,9 +117,7 @@ COMMON_DECLARE_bool(use_auto_growth_pinned_allocator);
 COMMON_DECLARE_bool(use_cuda_malloc_async_allocator);
 COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch);
 
-namespace paddle {
-namespace memory {
-namespace allocation {
+namespace paddle::memory::allocation {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class CUDAGraphAllocator
@@ -1908,6 +1906,4 @@ void AllocatorFacade::SetDefaultStream(const platform::CustomPlace& place,
 UNUSED static std::shared_ptr<NaiveBestFitAllocator> unused_obj =
     std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
 
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::allocation
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 0d4ddca4f237e..5c46376626994 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -42,9 +42,7 @@ PADDLE_DEFINE_EXPORTED_READONLY_bool(
 PADDLE_DEFINE_EXPORTED_READONLY_bool(print_allocator_trace_info,
                                      false,
                                      "print trace memory info");
-namespace paddle {
-namespace memory {
-namespace allocation {
+namespace paddle::memory::allocation {
 
 AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
     std::shared_ptr<Allocator> underlying_allocator,
@@ -226,6 +224,4 @@ void AutoGrowthBestFitAllocator::Trace() const {
           << " curr_chunks_num:" << chunks_.size();
 }
 
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::allocation
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
index 398c015627860..426eeeae70e55 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -19,9 +19,7 @@
 #include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace memory {
-namespace allocation {
+namespace paddle::memory::allocation {
 
 bool CPUAllocator::IsAllocThreadSafe() const { return true; }
 
@@ -52,6 +50,4 @@ phi::Allocation *CPUAllocator::AllocateImpl(size_t size) {
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
   return new Allocation(p, size, platform::CPUPlace());
 }
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::allocation
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 781addd7dba60..f233a5d8618eb 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -29,9 +29,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace memory {
-namespace allocation {
+namespace paddle::memory::allocation {
 bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
 void CUDAAllocator::FreeImpl(phi::Allocation* allocation) {
   PADDLE_ENFORCE_EQ(
@@ -86,6 +84,4 @@ phi::Allocation* CUDAAllocator::AllocateImpl(size_t size) {
       err_msg));
 }
 
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::allocation
diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
index a1b29a193a9e8..96ed41ad27dee 100644
--- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
@@ -29,9 +29,7 @@
 #endif
 #if CUDA_VERSION >= 10020
 
-namespace paddle {
-namespace memory {
-namespace allocation {
+namespace paddle::memory::allocation {
 
 CUDAVirtualMemAllocator::CUDAVirtualMemAllocator(
     const platform::CUDAPlace& place)
@@ -228,8 +226,6 @@ phi::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
       reinterpret_cast<void*>(ptr), size, platform::Place(place_));  // NOLINT
 }
 
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::allocation
 
 #endif
diff --git a/paddle/fluid/memory/allocation/memory_block.cc b/paddle/fluid/memory/allocation/memory_block.cc
index 26a2310c17e27..cf4407a22dd10 100644
--- a/paddle/fluid/memory/allocation/memory_block.cc
+++ b/paddle/fluid/memory/allocation/memory_block.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace memory {
-namespace detail {
+namespace paddle::memory::detail {
 
 void MemoryBlock::Init(MetadataCache* cache,
                        Type t,
@@ -154,6 +152,4 @@ MemoryBlock* MemoryBlock::Metadata() const {
       reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
 }
 
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::detail
diff --git a/paddle/fluid/memory/allocation/memory_block_desc.cc b/paddle/fluid/memory/allocation/memory_block_desc.cc
index d20d56a6d05e8..1d1f3c2396921 100644
--- a/paddle/fluid/memory/allocation/memory_block_desc.cc
+++ b/paddle/fluid/memory/allocation/memory_block_desc.cc
@@ -17,9 +17,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/allocation/memory_block.h"
 
-namespace paddle {
-namespace memory {
-namespace detail {
+namespace paddle::memory::detail {
 
 MemoryBlock::Desc::Desc(MemoryBlock::Type t,
                         size_t i,
@@ -74,6 +72,4 @@ bool MemoryBlock::Desc::CheckGuards() const {
   return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
 }
 
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::detail
diff --git a/paddle/fluid/memory/allocation/meta_cache.cc b/paddle/fluid/memory/allocation/meta_cache.cc
index 945b0f7b89283..cca35490551a6 100644
--- a/paddle/fluid/memory/allocation/meta_cache.cc
+++ b/paddle/fluid/memory/allocation/meta_cache.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace memory {
-namespace detail {
+namespace paddle::memory::detail {
 
 MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
 
@@ -64,6 +62,4 @@ void MetadataCache::Invalidate(MemoryBlock* block) {
   }
 }
 
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::detail
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 32853f08f94e5..fe7e722eab181 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -16,9 +16,7 @@
 
 #include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/platform/profiler/mem_tracing.h"
-namespace paddle {
-namespace memory {
-namespace allocation {
+namespace paddle::memory::allocation {
 bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
 void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
 #ifdef PADDLE_WITH_HIP
@@ -49,6 +47,4 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
                            platform::TracerMemEventType::ReservedAllocate);
   return new Allocation(ptr, size, platform::CUDAPinnedPlace());
 }
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::allocation
diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc
index a6e19b84ba8d1..903d8d85954d7 100644
--- a/paddle/fluid/memory/allocation/system_allocator.cc
+++ b/paddle/fluid/memory/allocation/system_allocator.cc
@@ -46,9 +46,7 @@ COMMON_DECLARE_double(fraction_of_gpu_memory_to_use);
 COMMON_DECLARE_uint64(initial_gpu_memory_in_mb);
 COMMON_DECLARE_uint64(reallocate_gpu_memory_in_mb);
 
-namespace paddle {
-namespace memory {
-namespace detail {
+namespace paddle::memory::detail {
 
 void* AlignedMalloc(size_t size) {
   void* p = nullptr;
@@ -348,6 +346,4 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) {
 bool CustomAllocator::UseGpu() const { return true; }
 #endif
 
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory::detail
diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc
index 2d66a5b6838b0..0eaf15c602224 100644
--- a/paddle/fluid/memory/stats.cc
+++ b/paddle/fluid/memory/stats.cc
@@ -22,8 +22,7 @@ PADDLE_DEFINE_EXPORTED_bool(
     log_memory_stats,
     false,
     "Log memory stats after each op runs, just used for debug.");
-namespace paddle {
-namespace memory {
+namespace paddle::memory {
 
 class StatRegistry {
  public:
@@ -173,5 +172,4 @@ int RegisterAllStats() {
 
 UNUSED static int register_all_stats = RegisterAllStats();
 
-}  // namespace memory
-}  // namespace paddle
+}  // namespace paddle::memory
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4714f3a2eb446..fc28e02b7bdb9 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -82,7 +82,7 @@ endif()
 
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi common phi_utils static_prim_api get_expected_kernel_func)
 
-register_operators(EXCLUDES py_func_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op
+register_operators(EXCLUDES py_func_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op run_program_op quantize_linear_op
         save_combine_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} processgroup_comm_utils)
 
 op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
@@ -108,8 +108,6 @@ if (WITH_GPU OR WITH_ROCM)
     op_library(sync_batch_norm_op DEPS processgroup_comm_utils)
 endif()
 
-op_library(lstm_op DEPS ${OP_HEADER_DEPS})
-
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 if (WITH_DGC)
diff --git a/paddle/fluid/operators/assign_pos_op.cc b/paddle/fluid/operators/assign_pos_op.cc
deleted file mode 100644
index 1157b3f964aaa..0000000000000
--- a/paddle/fluid/operators/assign_pos_op.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle::operators {
-
-class AssignPosOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("cum_count"), "Input", "cum_count", "AssignPos");
-    OP_INOUT_CHECK(
-        ctx->HasInput("eff_num_len"), "Input", "eff_num_len", "AssignPos");
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AssignPos");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AssignPos");
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto cum_count_dtype =
-        OperatorWithKernel::IndicateVarDataType(ctx, "cum_count");
-    auto X_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-    PADDLE_ENFORCE_EQ(cum_count_dtype,
-                      X_dtype,
-                      phi::errors::InvalidArgument(
-                          "The dtype of the cum_count and X should be same"));
-    PADDLE_ENFORCE_EQ(cum_count_dtype,
-                      framework::proto::VarType::INT64,
-                      phi::errors::InvalidArgument(
-                          "The dtype of the cum_count_dtype, eff_num_len and "
-                          "X should be same as int64"));
-    return phi::KernelKey(cum_count_dtype, ctx.device_context().GetPlace());
-  }
-};
-
-class AssignPosOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "numbers to scatter.");
-    AddInput("cum_count", "The cumulative sum count of numbers.");
-    AddInput("eff_num_len",
-             "The effective numbers of numbers should be scattered.");
-    AddOutput("Out", "Assemble numbers in the order of counters.");
-
-    AddComment(R"DOC(
-assign_pos_op Operator.
-
-Assign pos decides which tokens should be fetched belong to
-specially counter orderingly.
-
-)DOC");
-  }
-};
-
-}  // namespace paddle::operators
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(assign_pos,
-                             ops::AssignPosOp,
-                             ops::AssignPosOpMaker);
diff --git a/paddle/fluid/operators/channel_shuffle_op.cc b/paddle/fluid/operators/channel_shuffle_op.cc
deleted file mode 100644
index 69f75691a0318..0000000000000
--- a/paddle/fluid/operators/channel_shuffle_op.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/backward.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class ChannelShuffleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-class ChannelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), "
-             "the input feature data of ChannelShuffleOp, the layout is "
-             "[N, C, H, W] or [N, H, W, C].");
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>), the output of "
-              "ChannelShuffleOp. The layout is also [N, C, "
-              "H, W] or [N, H, W, C].");
-    AddAttr<int>("groups", "number of groups to divide channels in.");
-    AddAttr<std::string>(
-        "data_format",
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\", Specify the data format of the input data.")
-        .SetDefault("NCHW");
-
-    AddComment(R"DOC(
-    Channel Shuffle operator
-    This operator divides channels in a tensor of shape :math:`(*, C, H, W)`
-        into :math:`g` groups and rearranges them as :math:`(*, C/g, g, H, W)`
-        while keeping the original tensor shape.
-
-    Please refer to the paper:
-        `ShuffleNet: An Extremely Efficient Convolutional Neural Network for
-        Mobile Devices <https://arxiv.org/abs/1707.01083>`_
-        by Zhang et. al (2017) for more details.
-
-        )DOC");
-  }
-};
-
-class ChannelShuffleGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-template <typename T>
-class ChannelShuffleGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("channel_shuffle_grad");
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetAttrMap(this->Attrs());
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(channel_shuffle,
-                            ChannelShuffleInferShapeFunctor,
-                            PD_INFER_META(phi::ChannelShuffleInferMeta));
-
-REGISTER_OPERATOR(channel_shuffle,
-                  ops::ChannelShuffleOp,
-                  ops::ChannelShuffleOpMaker,
-                  ops::ChannelShuffleGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ChannelShuffleGradOpMaker<paddle::imperative::OpBase>,
-                  ChannelShuffleInferShapeFunctor);
-
-DECLARE_INFER_SHAPE_FUNCTOR(channel_shuffle_grad,
-                            ChannelShuffleGradInferShapeFunctor,
-                            PD_INFER_META(phi::ChannelShuffleGradInferMeta));
-
-REGISTER_OPERATOR(channel_shuffle_grad,
-                  ops::ChannelShuffleGradOp,
-                  ChannelShuffleGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc
index 963ea26321bdb..13d07557f1e7c 100644
--- a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc
@@ -14,17 +14,14 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CAllReduceAvgOpMaker : public CAllReduceOpMaker {
  protected:
@@ -33,8 +30,7 @@ class CAllReduceAvgOpMaker : public CAllReduceOpMaker {
 
 DECLARE_INPLACE_OP_INFERER(AllreduceAvgInplaceInferer, {"X", "Out"});
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
index ab174de1cec3c..c496ad8955e7c 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
@@ -14,19 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
 template <typename T>
 class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CAllReduceMaxOpMaker : public CAllReduceOpMaker {
  protected:
@@ -37,8 +34,7 @@ DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"});
 
 DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMax, kRedMax)
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
index b9bcc0174b03f..ad9fbda3dafeb 100644
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
@@ -14,19 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
 template <typename T>
 class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CAllReduceProdOpMaker : public CAllReduceOpMaker {
  protected:
@@ -37,8 +34,7 @@ DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"});
 
 DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceProd, kRedProd)
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
index 3a6156eb96e71..21729fd438b19 100644
--- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_comm_init_all_op.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CCommInitAllOp : public framework::OperatorWithKernel {
  public:
@@ -47,8 +46,7 @@ Initialize all collective communication context
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index 768c60c27b093..41c31b8e7b5cb 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -42,14 +42,11 @@ COMMON_DECLARE_bool(dynamic_static_unified_comm);
 #include "paddle/phi/core/distributed/store/store_utils.h"
 #include "paddle/phi/core/distributed/store/tcp_store.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class Scope;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CCommInitOp : public framework::OperatorBase {
  public:
@@ -183,8 +180,7 @@ Initialize collective communication context within this trainer
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc
index 2241ec81019d1..0da7899638880 100644
--- a/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_xccl_id_op.cc
@@ -24,8 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/device_manager.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 static void CopyXCCLIDToVar(const std::vector<phi::ccl::CCLRootId>& xccl_ids,
@@ -97,8 +96,7 @@ For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the ser
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc
index 569b9733aa6a1..dcca25ef76fdb 100644
--- a/paddle/fluid/operators/collective/c_reduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc
@@ -14,19 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
 template <typename T>
 class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CReduceMaxOpMaker : public CReduceOpMaker {
  protected:
@@ -35,8 +32,7 @@ class CReduceMaxOpMaker : public CReduceOpMaker {
 
 DEFINE_C_REDUCE_CPU_KERNEL(CReduceMax, kRedMax)
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc
index bdde5115d2e4b..11965ca4dd21f 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class CReduceScatterOp : public framework::OperatorWithKernel {
  public:
@@ -63,8 +62,7 @@ Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/us
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/global_scatter_op.cc b/paddle/fluid/operators/collective/global_scatter_op.cc
index e54c70d0d8db6..a91deb2f11d85 100644
--- a/paddle/fluid/operators/collective/global_scatter_op.cc
+++ b/paddle/fluid/operators/collective/global_scatter_op.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/global_scatter_op.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class GlobalScatterOp : public framework::OperatorWithKernel {
  public:
@@ -104,8 +103,7 @@ class GlobalScatterOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc
index 283826a5a31fc..f56d680afbb23 100644
--- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc
@@ -15,17 +15,14 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class MpAllReduceSumOp : public framework::OperatorWithKernel {
  public:
@@ -75,8 +72,7 @@ DECLARE_INPLACE_OP_INFERER(MpAllReduceSumInplaceInferer, {"X", "Out"});
 
 DEFINE_C_ALLREDUCE_CPU_KERNEL(MpAllReduceSum, kRedSum);
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc
index cf2a0ece1a7ab..961b8c4cf1382 100644
--- a/paddle/fluid/operators/collective/partial_send_op.cc
+++ b/paddle/fluid/operators/collective/partial_send_op.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/partial_send_op.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class PartialSendOp : public framework::OperatorWithKernel {
  public:
@@ -84,8 +83,7 @@ Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.h
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc
index cc41558804d6f..067488404a0b9 100644
--- a/paddle/fluid/operators/collective/send_v2_op.cc
+++ b/paddle/fluid/operators/collective/send_v2_op.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/send_v2_op.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class SendOpV2 : public framework::OperatorWithKernel {
  public:
@@ -78,8 +77,7 @@ Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.h
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc
index a7f7724a4be57..6b492aa9c5225 100644
--- a/paddle/fluid/operators/common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
@@ -14,18 +14,14 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class InferShapeContext;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
 // This file almostly contains all the infershape functions that are used in
 // operators.
 
-namespace paddle {
-namespace operators {
-namespace details {
+namespace paddle::operators::details {
 
 inline void GetBroadcastDimsArrays(const phi::DDim &x_dims,
                                    const phi::DDim &y_dims,
@@ -105,7 +101,8 @@ phi::DDim BroadcastTwoDims(const phi::DDim &x_dims,
   return common::make_ddim(out_dims_array);
 }
 
-}  // namespace details
+}  // namespace paddle::operators::details
+namespace paddle::operators {
 
 // shape input(0) -> output(0) without change.
 void UnaryOpUnchangedInferShape(framework::InferShapeContext *ctx) {
@@ -196,5 +193,4 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) {
   }
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt
index b18e026499243..1b602fe43aab1 100644
--- a/paddle/fluid/operators/compat/conv2d.pbtxt
+++ b/paddle/fluid/operators/compat/conv2d.pbtxt
@@ -50,7 +50,7 @@ extra {
   attrs {
     name: "quantization_type"
     type: STRING
-  } 
+  }
   attrs {
     name: "bit_length"
     type: INT
diff --git a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt
index c805547e0143d..ed04ecc4b71ec 100644
--- a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt
+++ b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt
@@ -8,7 +8,7 @@ def {
   }
   inputs {
     name: "Bias"
-  }  
+  }
   outputs {
     name: "Output"
   }
diff --git a/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt
index bce4fc9f0e114..93bf29b8b394a 100644
--- a/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt
+++ b/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt
@@ -8,7 +8,7 @@ def {
   }
   inputs {
     name: "Bias"
-  }  
+  }
   outputs {
     name: "Output"
   }
diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt
index ee04cd73dd70c..a0d80211c2594 100644
--- a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt
+++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt
@@ -42,7 +42,7 @@ extra {
   attrs {
     name: "quantization_type"
     type: STRING
-  } 
+  }
   attrs {
     name: "bit_length"
     type: INT
diff --git a/paddle/fluid/operators/compat/fused_transpose.pbtxt b/paddle/fluid/operators/compat/fused_transpose.pbtxt
index e4c7c218cc117..677d2e5792f75 100644
--- a/paddle/fluid/operators/compat/fused_transpose.pbtxt
+++ b/paddle/fluid/operators/compat/fused_transpose.pbtxt
@@ -17,26 +17,26 @@ def {
 extra {
   attrs{
     name: "fused_squeeze2_axes"
-    type: INTS        
+    type: INTS
   }
   attrs{
     name: "fused_unsqueeze2_axes"
-    type: INTS        
+    type: INTS
   }
   attrs{
     name: "fused_reshape2_shape"
-    type: INTS        
+    type: INTS
   }
   attrs{
     name: "scale"
-    type: FLOAT        
+    type: FLOAT
   }
   attrs{
     name: "shift"
-    type: FLOAT        
+    type: FLOAT
   }
   attrs{
     name: "output_data_type"
-    type: STRING        
+    type: STRING
   }
 }
diff --git a/paddle/fluid/operators/compat/mul.pbtxt b/paddle/fluid/operators/compat/mul.pbtxt
index 056f799c6c49c..28b40d0e6526c 100644
--- a/paddle/fluid/operators/compat/mul.pbtxt
+++ b/paddle/fluid/operators/compat/mul.pbtxt
@@ -22,7 +22,7 @@ extra {
   attrs {
     name: "Out0_threshold"
     type: FLOAT
-  } 
+  }
   attrs {
     name: "bit_length"
     type: INT
@@ -30,7 +30,7 @@ extra {
   attrs {
     name: "quantization_type"
     type: STRING
-  } 
+  }
   attrs {
     name: "skip_quant"
     type: BOOLEAN
diff --git a/paddle/fluid/operators/compat/sequence_conv.pbtxt b/paddle/fluid/operators/compat/sequence_conv.pbtxt
index c5335a25c557a..679b1095a57ba 100644
--- a/paddle/fluid/operators/compat/sequence_conv.pbtxt
+++ b/paddle/fluid/operators/compat/sequence_conv.pbtxt
@@ -23,7 +23,7 @@ def {
   attrs {
     name: "contextStride"
     type: INT
-   }  
+   }
 }
 extra {
   attrs {
@@ -49,5 +49,5 @@ extra {
   attrs {
     name: "op_device"
     type: STRING
-  }   
+  }
 }
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index 0cc5a0bbd0927..5d4b4b21ccdcc 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -25,8 +25,7 @@ limitations under the License. */
 
 COMMON_DECLARE_bool(use_mkldnn);
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 const char ConditionalOp::kInputs[] = "Input";        // NOLINT
 const char ConditionalOp::kOutputs[] = "Out";         // NOLINT
@@ -334,8 +333,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conditional_block,
diff --git a/paddle/fluid/operators/controlflow/pylayer_op.cc b/paddle/fluid/operators/controlflow/pylayer_op.cc
index 0a6cb8aac83c0..872028bead1c9 100644
--- a/paddle/fluid/operators/controlflow/pylayer_op.cc
+++ b/paddle/fluid/operators/controlflow/pylayer_op.cc
@@ -19,8 +19,7 @@
 #include "paddle/fluid/operators/controlflow/control_flow_op_helper.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 namespace {  // NOLINT
 enum class PyLayerBlockIndex { kFORWARD = 0, kBACKWARD = 1, kNONE = 2 };
@@ -263,8 +262,7 @@ class PyLayerBackwardInferVarType : public framework::VarTypeInference {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(pylayer,
diff --git a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc
index bdd669c644e6e..68263cec46282 100644
--- a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc
@@ -16,14 +16,11 @@
 
 #include <string>
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class ProgramDesc;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 static bool IsMatchedPyLayerOpAndPyLayerGradOp(const OpVariant &fwd_op,
                                                const OpVariant &bwd_op) {
@@ -173,5 +170,4 @@ void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp(
       program, &fwd_ops, &bwd_ops);
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc
index de04cb0e3bba5..ffdb3f01454a2 100644
--- a/paddle/fluid/operators/custom_device_common_op_registry.cc
+++ b/paddle/fluid/operators/custom_device_common_op_registry.cc
@@ -1366,7 +1366,10 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) {
           float>,
       paddle::operators::CConcatOpCustomDeviceKernel<
           paddle::platform::CustomDeviceContext,
-          phi::dtype::float16>);
+          phi::dtype::float16>,
+      paddle::operators::CConcatOpCustomDeviceKernel<
+          paddle::platform::CustomDeviceContext,
+          phi::dtype::bfloat16>);
   REGISTER_OP_CUSTOM_DEVICE_KERNEL(
       c_split,
       device_type,
@@ -1378,7 +1381,10 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) {
           int>,
       paddle::operators::CSplitOpCustomDeviceKernel<
           paddle::platform::CustomDeviceContext,
-          phi::dtype::float16>);
+          phi::dtype::float16>,
+      paddle::operators::CSplitOpCustomDeviceKernel<
+          paddle::platform::CustomDeviceContext,
+          phi::dtype::bfloat16>);
   REGISTER_OP_CUSTOM_DEVICE_KERNEL(
       c_embedding,
       device_type,
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index c3880559f737f..8fdf5326a29c9 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class BipartiteMatchOp : public framework::OperatorWithKernel {
  public:
@@ -308,8 +307,7 @@ If Tensor, the height of ColToRowMatchIndices is 1.
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index d9bb602338352..c15c161a60999 100755
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -6,8 +6,6 @@ endif()
 register_operators(
   EXCLUDES
   fused_bn_activation_op
-  fusion_group_op
-  fusion_lstm_op
   fused_bn_add_activation_op
   fused_attention_op
   fused_transformer_op
@@ -19,8 +17,6 @@ register_operators(
   fused_gate_attention_op
   resnet_basic_block_op)
 
-op_library(fusion_lstm_op)
-
 if(WITH_XPU)
   op_library(resnet_basic_block_op)
   op_library(resnet_unit_op)
@@ -38,10 +34,6 @@ if(WITH_GPU OR WITH_ROCM)
   # HIP not support cudnnTransformTensor
   # HIP not support cudnnConvolutionBiasActivationForward
   op_library(fused_gate_attention_op)
-  # fusion_group
-  if(NOT APPLE AND NOT WIN32)
-    op_library(fusion_group_op)
-  endif()
   # fused_bn_add_activation
   # HIP not support bn act fuse in MIOPEN
   if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401))
diff --git a/paddle/fluid/operators/fused/fused_conv2d_op.cc b/paddle/fluid/operators/fused/fused_conv2d_op.cc
index 4e440bc972fbd..04d2d4043bf96 100644
--- a/paddle/fluid/operators/fused/fused_conv2d_op.cc
+++ b/paddle/fluid/operators/fused/fused_conv2d_op.cc
@@ -20,8 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/generator/get_expected_kernel_func.h"
 #include "paddle/phi/infermeta/multiary.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class FusedConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -118,8 +117,7 @@ class FusedConvOp : public framework::OperatorWithKernel {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
deleted file mode 100644
index 3ded78e3be4cf..0000000000000
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc
+++ /dev/null
@@ -1,500 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
-
-#include <memory>
-#include <unordered_set>
-
-namespace paddle {
-namespace operators {
-
-bool IsUnaryCompound(const std::vector<std::string> &functor_list) {
-  PADDLE_ENFORCE_EQ(
-      functor_list.size(),
-      2,
-      phi::errors::InvalidArgument(
-          "Invalid functor list size %d, which should be equal to %d.",
-          functor_list.size(),
-          2));
-  static std::unordered_set<std::string> binary_fun = {"elementwise_add",
-                                                       "elementwise_mul",
-                                                       "elementwise_add_grad",
-                                                       "elementwise_mul_grad"};
-  return binary_fun.count(functor_list[1]) != 0;
-}
-
-bool HasInPlaceUnary(const std::vector<std::string> &functor_list) {
-  PADDLE_ENFORCE_EQ(
-      functor_list.size(),
-      2,
-      phi::errors::InvalidArgument(
-          "Invalid functor list size %d, which should be equal to %d.",
-          functor_list.size(),
-          2));
-  static std::unordered_set<std::string> InplaceOpSet = {"relu", "relu_grad"};
-  bool is_in_place = false;
-  for (auto &func_name : functor_list) {
-    is_in_place |= (InplaceOpSet.count(func_name) == 1);
-  }
-  return is_in_place;
-}
-
-bool InputXCanBeAbsent(const std::vector<std::string> &functor_list) {
-  PADDLE_ENFORCE_EQ(
-      functor_list.size(),
-      2,
-      phi::errors::InvalidArgument(
-          "Invalid functor list size %d, which should be equal to %d.",
-          functor_list.size(),
-          2));
-  static std::unordered_set<std::string> binary_fun = {"elementwise_add_grad"};
-  return binary_fun.count(functor_list[0]) != 0 ||
-         binary_fun.count(functor_list[1]) != 0;
-}
-
-/*
- * Whether the compound function is supported.
- * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final
- * out.
- */
-static bool IsSupportedCompound(const std::vector<std::string> &functors) {
-  PADDLE_ENFORCE_EQ(
-      functors.size(),
-      2UL,
-      phi::errors::InvalidArgument(
-          "Invalid functor list size %d, which should be equal to %d.",
-          functors.size(),
-          2));
-
-  static std::unordered_set<std::string> unary_fun = {
-      "scale", "relu", "tanh", "sigmoid", "gelu"};
-  static std::unordered_set<std::string> binary_fun = {"elementwise_add",
-                                                       "elementwise_mul"};
-
-  std::string unary_fun_str;
-  if (binary_fun.count(functors[0])) {
-    unary_fun_str = functors[1];
-  } else if (binary_fun.count(functors[1])) {
-    unary_fun_str = functors[0];
-  } else {
-    PADDLE_THROW(phi::errors::InvalidArgument(
-        "%s and %s are not included in fused_list.", functors[0], functors[1]));
-  }
-  PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str),
-                    1,
-                    phi::errors::InvalidArgument(
-                        "%s is not included in fused_list.", unary_fun_str));
-  return true;
-}
-
-class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"),
-        true,
-        phi::errors::InvalidArgument(
-            "Input(X) of FusedElemwiseActivationOp op should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Y"),
-        true,
-        phi::errors::InvalidArgument(
-            "Input(Y) of FusedElemwiseActivationOp op should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"),
-        true,
-        phi::errors::InvalidArgument(
-            "Output(Out) of FusedElemwiseActivationOp op should not be null."));
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto y_dim = ctx->GetInputDim("Y");
-
-    // Whether the shape of Y is a continuous subsequence of X,
-    // For more information please refer to the op's introduction.
-    bool bcast_y = IsBcastY(x_dim, y_dim);
-
-    auto &out_dim = bcast_y ? x_dim : y_dim;
-    std::string out_lod = bcast_y ? "X" : "Y";
-
-    if (ctx->Attrs().Get<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE_EQ(
-          ctx->HasOutput("IntermediateOut"),
-          true,
-          phi::errors::InvalidArgument(
-              "Output(IntermediateOut) of FusedElemwiseActivationOp "
-              "should not be null."));
-
-      if (IsUnaryCompound(
-              ctx->Attrs().Get<std::vector<std::string>>("functor_list"))) {
-        // for Unary(Binary(X, Y)), the shape and lod of out and
-        // intermediate_out are the same.
-        ctx->SetOutputDim("IntermediateOut", out_dim);
-        // set the lod of intermediate_out
-        ctx->ShareLoD(out_lod, /*->*/ "IntermediateOut");
-      } else {
-        // for Binary(X, Unary(Y)), the shape and lod of Y and
-        // intermediate_out are the same.
-        ctx->SetOutputDim("IntermediateOut", y_dim);
-        // set the lod of intermediate_out
-        ctx->ShareLoD("Y", /*->*/ "IntermediateOut");
-      }
-    }
-    ctx->SetOutputDim("Out", out_dim);
-    ctx->ShareLoD(out_lod, /*->*/ "Out");
-  }
-
-  static bool IsBcastY(const phi::DDim &x_dim, const phi::DDim &y_dim) {
-    bool bcast_y = x_dim.size() >= y_dim.size();
-    if (x_dim.size() == y_dim.size()) {
-      for (int i = 0; i < x_dim.size(); ++i) {
-        if (x_dim[i] < y_dim[i]) {
-          bcast_y = false;
-          break;
-        }
-      }
-    }
-    return bcast_y;
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx.Input<phi::DenseTensor>("X")->dtype(),
-                      ctx.Input<phi::DenseTensor>("Y")->dtype(),
-                      phi::errors::InvalidArgument(
-                          "The element's type of input should be the same."));
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of fused_elemwise_activation operator.");
-    AddInput(
-        "Y",
-        "(Tensor) The input tensor of fused_elemwise_activation operator.");
-    AddOutput("Out",
-              "vector<Tensor> The output tensor of fused_elemwise_activation "
-              "operator.");
-    AddOutput("IntermediateOut",
-              "Tensor The IntermediateOut tensor of fused_elemwise_activation "
-              "operator.")
-        .AsIntermediate();
-    AddAttr<int>("axis",
-                 "axis is used by elementwise_op, the default value is -1.")
-        .SetDefault(-1);
-    AddAttr<float>("scale",
-                   "scale is used by scale_op, the default value is 0.0.")
-        .SetDefault(0.0);
-    AddAttr<bool>("save_intermediate_out",
-                  "Whether to save the intermediate_out.")
-        .SetDefault(false);
-    AddAttr<std::vector<std::string>>("functor_list",
-                                      "The functors that should be fused.")
-        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
-          PADDLE_ENFORCE_EQ(
-              IsSupportedCompound(functor_list),
-              true,
-              phi::errors::InvalidArgument(
-                  "the input functors should support compounding."));
-        });
-
-    AddComment(R"DOC(
-FusedElemwiseActivation Operator.
-
-At present, FusedElemwiseActivation only supports Two kinds of compound
-operators (elementwise_op and activation_op):
-
-    Z = Binary(X, Unary(Y))
-    Z = Unary(Binary(X, Y))
-
-There are two cases for this operator:
-
-1. The shape of $Y$ and $X$ is the same.
-2. The shape of $Y$ is a continuous subsequence of $X$ or the shape of $X$ is a continuous subsequence of $Y$.
-
-For case 2 (assume that the shape of $Y$ is a continuous subsequence of $X$ ):
-
-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
-   for broadcasting $Y$ onto $X$.
-2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
-   subsequence, such as shape(Y) = (2, 1) => (2).
-
-For example:
-
-    .. code-block:: text
-
-        shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-        shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-        shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
-        shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-        shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
-        shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-
-
-The inputs $X$ and $Y$ can carry the different LoD information.
-But the output only shares the LoD information with the one whose shape is the same with Out.
-The attributions of activation_op can be get from fused_elemwise_activation_op's.
-The functor_list records the functions to be fused, for example
-["scale", "elementwise_add"].
-
-)DOC");
-  }
-};
-
-template <typename T>
-class FusedElemwiseActivationGradMaker
-    : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> grad_op) const override {
-    grad_op->SetType(this->ForwardOpType() + "_grad");
-
-    for (auto &input_param : this->InputNames()) {
-      grad_op->SetInput(input_param, this->Input(input_param));
-      grad_op->SetOutput(framework::GradVarName(input_param),
-                         this->InputGrad(input_param, true));
-    }
-
-    grad_op->SetInput("Out", this->Output("Out"));
-    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-
-    grad_op->SetAttrMap(this->Attrs());
-
-    std::vector<std::string> functor_names = PADDLE_GET_CONST(
-        std::vector<std::string>, grad_op->GetAttr("functor_list"));
-
-    functor_names[0] += "_grad";
-    functor_names[1] += "_grad";
-    grad_op->SetAttr("functor_list", functor_names);
-
-    if (PADDLE_GET_CONST(bool, grad_op->GetAttr("save_intermediate_out"))) {
-      // PADDLE_ENFORCE_NE(Output("IntermediateOut").size(), 0);
-      grad_op->SetInput("IntermediateOut", this->Output("IntermediateOut"));
-      grad_op->SetOutput(framework::GradVarName("IntermediateOut"),
-                         this->OutputGrad("IntermediateOut"));
-    } else {
-      grad_op->SetInput("IntermediateOut", this->EmptyOutput());
-      grad_op->SetOutput(framework::GradVarName("IntermediateOut"),
-                         this->EmptyOutputGrad());
-    }
-  }
-};
-
-class FusedElemwiseAddActivationMaker : public FusedElemwiseActivationMaker {};
-
-template <typename T>
-class FusedElemwiseAddActivationGradMaker
-    : public FusedElemwiseActivationGradMaker<T> {
- public:
-  using FusedElemwiseActivationGradMaker<T>::FusedElemwiseActivationGradMaker;
-};
-
-class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")),
-        true,
-        phi::errors::InvalidArgument("Input(Out@Grad) should not be null."));
-
-    auto functor_list =
-        ctx->Attrs().Get<std::vector<std::string>>("functor_list");
-
-    if (ctx->Attrs().Get<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE_EQ(ctx->HasInput("IntermediateOut"),
-                        true,
-                        phi::errors::InvalidArgument(
-                            "Input(IntermediateOut) should not be null."));
-    } else {
-      if (!InputXCanBeAbsent(functor_list)) {
-        PADDLE_ENFORCE_EQ(
-            ctx->HasInput("X"),
-            true,
-            phi::errors::InvalidArgument("Input(X) should not be null."));
-      }
-    }
-
-    auto x_grad_name = framework::GradVarName("X");
-    auto y_grad_name = framework::GradVarName("Y");
-    auto inter_grad_name = framework::GradVarName("IntermediateOut");
-
-    if (ctx->HasOutput(x_grad_name)) {
-      if (ctx->HasInputs("X")) {
-        ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
-        ctx->ShareLoD("X", x_grad_name);
-      } else {
-        // Currently, only when Binary is elementwise_add or elementwise_sub,
-        // the "X" could be absent.
-        PADDLE_ENFORCE_EQ(
-            InputXCanBeAbsent(functor_list),
-            true,
-            phi::errors::InvalidArgument(
-                "Only when BinaryFunctor is elementwise_add, the 'X' "
-                "could be absent."));
-
-        // Node: If "X" is absence, the shape of Y should be a continuous
-        // subsequence of X, otherwise, we could not infer the shape of dx.
-
-        ctx->SetOutputDim(x_grad_name,
-                          ctx->GetInputDim(framework::GradVarName("Out")));
-        ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name);
-      }
-    }
-
-    if (ctx->HasOutput(y_grad_name)) {
-      PADDLE_ENFORCE_EQ(
-          ctx->HasInput("Y"),
-          true,
-          phi::errors::InvalidArgument("Input(Y) should not be null."));
-      ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y"));
-      ctx->ShareLoD("Y", y_grad_name);
-    }
-
-    if (ctx->HasOutput(inter_grad_name)) {
-      // For Unary(Binary(X, Y)), IntermediateOut should not be empty.
-      if (IsUnaryCompound(functor_list)) {
-        ctx->SetOutputDim(inter_grad_name,
-                          ctx->GetInputDim(framework::GradVarName("Out")));
-        ctx->ShareLoD(framework::GradVarName("Out"), inter_grad_name);
-      } else {
-        ctx->SetOutputDim(inter_grad_name, ctx->GetInputDim("Y"));
-        ctx->ShareLoD("Y", inter_grad_name);
-      }
-    }
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out")),
-                          ctx.GetPlace());
-  }
-};
-
-class FusedElemwiseAddActivationOp : public FusedElemwiseActivationOp {
- public:
-  using FusedElemwiseActivationOp::FusedElemwiseActivationOp;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    FusedElemwiseActivationOp::InferShape(ctx);
-    std::vector<std::string> functor_names =
-        ctx->Attrs().Get<std::vector<std::string>>("functor_list");
-    bool elemntwise_add_detected = false;
-    for (auto const &names : functor_names) {
-      if (names == "elementwise_add") {
-        elemntwise_add_detected = true;
-        break;
-      }
-    }
-    PADDLE_ENFORCE_EQ(
-        elemntwise_add_detected,
-        true,
-        phi::errors::InvalidArgument(
-            "When the FusedElemwiseAddActivationOp Is used in fused pass, the "
-            "elementwise_add Op must be"
-            "detected and used, Please check the fuse pass pattern"));
-  }
-};
-
-class FusedElemwiseAddActivationOpGrad : public FusedElemwiseActivationOpGrad {
- public:
-  using FusedElemwiseActivationOpGrad::FusedElemwiseActivationOpGrad;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    FusedElemwiseActivationOpGrad::InferShape(ctx);
-    std::vector<std::string> functor_names =
-        ctx->Attrs().Get<std::vector<std::string>>("functor_list");
-    bool elemntwise_add_grad_detected = false;
-    for (auto const &names : functor_names) {
-      if (names == "elementwise_add_grad") {
-        elemntwise_add_grad_detected = true;
-        break;
-      }
-    }
-    PADDLE_ENFORCE_EQ(
-        elemntwise_add_grad_detected,
-        true,
-        phi::errors::InvalidArgument(
-            "When the FusedElemwiseAddActivationOpGrad Is used in fused pass, "
-            "the elementwise_add_grad Op must be"
-            "detected and used, Please check the fuse pass pattern"));
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERER(
-    FusedElemwiseAddActivationNoNeddBufVarInferer, "X", "Y");
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    fused_elemwise_activation,
-    ops::FusedElemwiseActivationOp,
-    ops::FusedElemwiseActivationMaker,
-    ops::FusedElemwiseActivationGradMaker<paddle::framework::OpDesc>,
-    ops::FusedElemwiseActivationGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(fused_elemwise_activation_grad,
-                  ops::FusedElemwiseActivationOpGrad);
-
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseActivationKernel,
-                          float,
-                          double) {}
-
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation_grad,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseActivationGradKernel,
-                          float,
-                          double) {}
-
-// for memory optimization, we register the fused_elemwise_add_activation OP
-REGISTER_OPERATOR(
-    fused_elemwise_add_activation,
-    ops::FusedElemwiseAddActivationOp,
-    ops::FusedElemwiseAddActivationMaker,
-    ops::FusedElemwiseAddActivationGradMaker<paddle::framework::OpDesc>,
-    ops::FusedElemwiseAddActivationGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(fused_elemwise_add_activation_grad,
-                  ops::FusedElemwiseAddActivationNoNeddBufVarInferer,
-                  ops::FusedElemwiseAddActivationOpGrad);
-
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseAddActivationKernel,
-                          float,
-                          double) {}
-
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation_grad,
-                          CPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseAddActivationGradKernel,
-                          float,
-                          double) {}
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
deleted file mode 100644
index d231bbff9b93b..0000000000000
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h"
-
-namespace ops = paddle::operators;
-
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation,
-                          GPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseActivationKernel,
-                          float,
-                          double,
-                          phi::dtype::float16) {}
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_activation_grad,
-                          GPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseActivationGradKernel,
-                          float,
-                          double,
-                          phi::dtype::float16) {}
-
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation,
-                          GPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseAddActivationKernel,
-                          float,
-                          double,
-                          phi::dtype::float16) {}
-PD_REGISTER_STRUCT_KERNEL(fused_elemwise_add_activation_grad,
-                          GPU,
-                          ALL_LAYOUT,
-                          ops::FusedElemwiseAddActivationGradKernel,
-                          float,
-                          double,
-                          phi::dtype::float16) {}
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index ea8040c763644..685400f167e7c 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -20,8 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 /**
  * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
@@ -430,8 +429,7 @@ class FusedFeedForwardOpDoubleGradMaker
  protected:
   void Apply(GradOpPtr<T> grad_op) const override {}
 };
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fused_feedforward,
diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc
index 16aa1398e08d8..717eb990f49f3 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 using DDim = phi::DDim;
 
@@ -365,8 +364,7 @@ class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc
deleted file mode 100644
index b42dd927c6e31..0000000000000
--- a/paddle/fluid/operators/fused/fusion_group_op.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/infermeta/multiary.h"
-
-namespace paddle {
-namespace operators {
-
-class FusionGroupOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(framework::proto::VarType::FP32, phi::GPUPlace(0));
-  };
-};
-
-class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Inputs",
-             "(std::vector<phi::DenseTensor>) The inputs of fusion_group op.")
-        .AsDuplicable();
-    AddOutput("Outs",
-              "(std::vector<phi::DenseTensor>) The outputs of fusion_group op.")
-        .AsDuplicable();
-    AddAttr<std::vector<int>>("outs_dtype",
-                              "The data type of Outputs in fusion_group op.")
-        .SetDefault({});
-    AddAttr<std::vector<int>>("inputs_dtype",
-                              "The data type of Inputs in fusion_group op.")
-        .SetDefault({});
-    AddAttr<int>("type", "Fusion type.").SetDefault(0);
-    AddAttr<std::string>("func_name", "Name of the generated functions.")
-        .SetDefault("");
-    AddComment(R"DOC(
-fusion_group Operator.
-
-It is used to execute a generated CUDA kernel which fuse the computation of
-multiple operators into one. It supports several types:
-0, fused computation of elementwise operations in which all the dims of inputs
-    and outputs should be exactly the same.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-DECLARE_INFER_SHAPE_FUNCTOR(fusion_group,
-                            FusionGroupInferShapeFunctor,
-                            PD_INFER_META(phi::FusionGroupInferMeta));
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_group,
-                  ops::FusionGroupOp,
-                  ops::FusionGroupOpMaker,
-                  FusionGroupInferShapeFunctor);
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc
deleted file mode 100644
index f40ac248f1962..0000000000000
--- a/paddle/fluid/operators/fused/fusion_lstm_op.cc
+++ /dev/null
@@ -1,577 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
-
-#include <string>
-
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/fc_functor.h"
-#include "paddle/phi/kernels/funcs/jit/kernels.h"
-#include "paddle/phi/kernels/funcs/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_lstm");
-  OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_lstm");
-  OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_lstm");
-  OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "fusion_lstm");
-  OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_lstm");
-  OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_lstm");
-  OP_INOUT_CHECK(ctx->HasOutput("Cell"), "Output", "Cell", "fusion_lstm");
-
-  auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(),
-                    2,
-                    phi::errors::InvalidArgument(
-                        "Input(X)'s rank must be 2, but received x's rank "
-                        "is:%d, x dim is:[%s]",
-                        x_dims.size(),
-                        x_dims));
-
-  if (ctx->HasInput("H0")) {
-    OP_INOUT_CHECK(ctx->HasInput("C0"), "Input", "C0", "fusion_lstm");
-    auto h_dims = ctx->GetInputDim("H0");
-    auto c_dims = ctx->GetInputDim("C0");
-    PADDLE_ENFORCE_EQ(h_dims,
-                      c_dims,
-                      phi::errors::InvalidArgument(
-                          "The dimension of Input(H0) and Input(C0) should be "
-                          "same, but received h0 dims is:[%s], c0 dims is:[%s]",
-                          h_dims,
-                          c_dims));
-  }
-
-  auto wx_dims = ctx->GetInputDim("WeightX");
-  PADDLE_ENFORCE_EQ(wx_dims.size(),
-                    2,
-                    phi::errors::InvalidArgument(
-                        "The rank of Input(WeightX) should be 2, but received "
-                        "WeightX's rank is:%d, WeightX dim is:[%s]",
-                        wx_dims.size(),
-                        wx_dims));
-  PADDLE_ENFORCE_EQ(wx_dims[0],
-                    x_dims[1],
-                    phi::errors::InvalidArgument(
-                        "The first dimension of Input(WeightX) "
-                        "should equal to second dimension of Input(X), but "
-                        "received WeightX first dim is:%d, X second dim is:%d",
-                        wx_dims[0],
-                        x_dims[1]));
-
-  int frame_size = static_cast<int>(wx_dims[1] / 4);
-  auto wh_dims = ctx->GetInputDim("WeightH");
-
-  PADDLE_ENFORCE_EQ(wh_dims.size(),
-                    2,
-                    phi::errors::InvalidArgument(
-                        "The rank of Input(WeightH) should be 2, but received "
-                        "WeightH rank is:%d, WeightH dim is:[%s]",
-                        wh_dims.size(),
-                        wh_dims));
-  PADDLE_ENFORCE_EQ(wh_dims[0],
-                    frame_size,
-                    phi::errors::InvalidArgument(
-                        "The first dimension of Input(WeightH) "
-                        "should equal to frame size, but received WeightH "
-                        "first dim is:%d, frame size is:%d.",
-                        wh_dims[0],
-                        frame_size));
-
-  PADDLE_ENFORCE_EQ(wh_dims[1],
-                    4 * frame_size,
-                    phi::errors::InvalidArgument(
-                        "The second dimension of Input(WeightH) "
-                        "should equal to 4 * frame_size, but received WeightH "
-                        "second dimension is:%d, frame size is:%d.",
-                        wh_dims[1],
-                        frame_size));
-
-  auto b_dims = ctx->GetInputDim("Bias");
-  PADDLE_ENFORCE_EQ(b_dims.size(),
-                    2,
-                    phi::errors::InvalidArgument(
-                        "The rank of Input(Bias) should be 2, but received "
-                        "Bias rank is:%d, Bias dim is:[%s]",
-                        b_dims.size(),
-                        b_dims));
-  PADDLE_ENFORCE_EQ(b_dims[0],
-                    1,
-                    phi::errors::InvalidArgument(
-                        "The first dimension of Input(Bias) should be 1, but "
-                        "received Bias's dimension is:[%s]",
-                        b_dims));
-
-  if (ctx->Attrs().Get<bool>("use_peepholes")) {
-    PADDLE_ENFORCE_EQ(b_dims[1],
-                      7 * frame_size,
-                      phi::errors::InvalidArgument(
-                          "The second dimension of Input(Bias) should be "
-                          "7 * %d if enable peepholes connection, but received "
-                          "Bias dim is:[%s]",
-                          frame_size,
-                          b_dims));
-    ctx->SetOutputDim("CheckedCell", {2, frame_size});
-  } else {
-    PADDLE_ENFORCE_EQ(
-        b_dims[1],
-        4 * frame_size,
-        phi::errors::InvalidArgument(
-            "The second dimension of Input(Bias) should be "
-            "4 * %d if disable peepholes, but received Bias dim is:[%s]",
-            frame_size,
-            b_dims));
-  }
-
-  phi::DDim out_dims({x_dims[0], frame_size});
-  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("Cell", out_dims);
-  ctx->ShareLoD("X", "Hidden");
-  ctx->ShareLoD("X", "Cell");
-  int xx_width = 0;
-  if (ctx->Attrs().Get<bool>("use_seq")) {
-    xx_width = static_cast<int>(wx_dims[1]);
-  } else {
-    xx_width =
-        static_cast<int>(x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]);
-
-    OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"),
-                   "Output",
-                   "BatchedInput",
-                   "fusion_lstm");
-    OP_INOUT_CHECK(ctx->HasOutput("BatchedHidden"),
-                   "Output",
-                   "BatchedHidden",
-                   "fusion_lstm");
-    OP_INOUT_CHECK(
-        ctx->HasOutput("BatchedCell"), "Output", "BatchedCell", "fusion_lstm");
-    OP_INOUT_CHECK(
-        ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", "fusion_lstm");
-    OP_INOUT_CHECK(
-        ctx->HasOutput("ReorderedC0"), "Output", "ReorderedC0", "fusion_lstm");
-
-    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
-    ctx->SetOutputDim("BatchedHidden", out_dims);
-    ctx->SetOutputDim("BatchedCell", out_dims);
-  }
-  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
-  ctx->ShareLoD("X", "XX");
-}
-
-phi::KernelKey FusionLSTMOp::GetExpectedKernelType(
-    const framework::ExecutionContext& ctx) const {
-  auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-  return phi::KernelKey(data_type, ctx.GetPlace());
-}
-
-void FusionLSTMOpMaker::Make() {
-  AddInput(
-      "X",
-      "(phi::DenseTensor) the input is a LodTensor, which support "
-      "variable-time length input sequence. The underlying tensor in "
-      "this phi::DenseTensor is a matrix with shape (T X M), where T is the "
-      "total time steps in this mini-batch, M is the dim size of x.");
-  AddInput("WeightX",
-           "(phi::DenseTensor) the learnable weights of X."
-           " - The shape is (M x 4D), where M is the dim size of x, D is the "
-           "hidden size. "
-           " - Weight = {W_cx, W_ix, W_fx, W_ox}");
-  AddInput(
-      "WeightH",
-      "(phi::DenseTensor) same as LSTMOp, the learnable hidden-hidden weights."
-      " - The shape is (D x 4D), where D is the hidden size. "
-      " - Weight = {W_ch, W_ih, W_fh, W_oh}");
-  AddInput("Bias",
-           "(phi::DenseTensor) the learnable weights. Almost same as LSTMOp"
-           "Note: we should add the fc bias into this (1x4D) in bias."
-           "input-hidden bias weight and peephole connections weight if "
-           "setting `use_peepholes` True. "
-           "1. `use_peepholes = False` "
-           " - The shape is (1 x 4D). "
-           " - Bias = {b_c, b_i, b_f, b_o}."
-           "2. `use_peepholes = True` "
-           " - The shape is (1 x 7D). "
-           " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
-  AddInput("H0",
-           "(phi::DenseTensor, optional) (same as LSTMOp) the initial hidden "
-           "state is an "
-           "optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size and D is the hidden size.")
-      .AsDispensable();
-  AddInput("C0",
-           "(phi::DenseTensor, optional) (same as LSTMOp) (the initial cell "
-           "state is an "
-           "optional "
-           "input. This is a tensor with shape (N x D), where N is the "
-           "batch size. `H0` and `C0` can be NULL but only at the same time.")
-      .AsDispensable();
-  AddOutput(
-      "Hidden",
-      "(phi::DenseTensor) (same as LSTMOp) the hidden state of LSTM operator. "
-      "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput(
-      "Cell",
-      "(phi::DenseTensor) (same as LSTMOp) the cell state of LSTM operator. "
-      "The shape is (T x D), and lod is the same with the `Input`.");
-  AddOutput("XX",
-            "(phi::DenseTensor) the result after X * WeightX (size is T x 4D)"
-            " or batched_X (size is T x M), this will be automatically chosen,"
-            " where T is the total time steps in this mini-batch,"
-            " D is the hidden size, M is the dim size of x input.")
-      .AsIntermediate();
-  AddOutput("BatchedInput", "(phi::DenseTensor) (T x 4D).").AsIntermediate();
-  AddOutput("BatchedHidden", "(phi::DenseTensor) (T x D).").AsIntermediate();
-  AddOutput("BatchedCell", "(phi::DenseTensor) (T x D).").AsIntermediate();
-  AddOutput("ReorderedH0", "(phi::DenseTensor) (N x D).").AsIntermediate();
-  AddOutput("ReorderedC0", "(phi::DenseTensor) (N x D).").AsIntermediate();
-  AddOutput("CheckedCell", "(phi::DenseTensor) (2 x D) only for peephole.")
-      .AsIntermediate();
-  AddAttr<bool>("use_peepholes",
-                "(bool, default: True) "
-                "whether to enable diagonal/peephole connections.")
-      .SetDefault(true);
-  AddAttr<bool>("is_reverse",
-                "(bool, default: False) "
-                "whether to compute reversed LSTM.")
-      .SetDefault(false);
-  AddAttr<bool>("use_seq",
-                "(bool, default: True) "
-                "whether to use seq mode to compute.")
-      .SetDefault(true);
-  AddAttr<std::string>("gate_activation",
-                       "(string, default: sigmoid)"
-                       "The activation for input gate, forget gate and output "
-                       "gate, `sigmoid` by default.")
-      .SetDefault("sigmoid")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("cell_activation",
-                       "(string, default: tanh)"
-                       "The activation for cell output, `tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<std::string>("candidate_activation",
-                       "(string, default: tanh)"
-                       "The activation for candidate hidden state, "
-                       "`tanh` by default.")
-      .SetDefault("tanh")
-      .InEnum({"sigmoid", "tanh", "relu", "identity"});
-  AddAttr<float>("Scale_data",
-                 "Scale to be used for int8 input/output data."
-                 "Only used with MKL-DNN INT8.")
-      .SetDefault(1.0f);
-  AddAttr<float>("Shift_data",
-                 "Shift to be used for int8 input/output data."
-                 "Only used with MKL-DNN INT8.")
-      .SetDefault(0.0f);
-  AddAttr<std::vector<float>>("Scale_weights",
-                              "Scale_weights to be used for int8 weights data."
-                              "Only used with MKL-DNN INT8.")
-      .SetDefault({1.0f});
-  AddAttr<bool>("force_fp32_output",
-                "(bool, default false) Force INT8 kernel output FP32, only "
-                "used in MKL-DNN INT8")
-      .SetDefault(false);
-  AddComment(R"DOC(
-Fusion Long-Short Term Memory (LSTM) Operator.
-This operator fuse the X into LSTM, more details can refer to LSTM op.
-)DOC");
-}
-
-template <typename T, typename DeviceContext>
-class FusionLSTMKernel : public framework::OpKernel<T> {
- public:
-#define INIT_BASE_DEFINES                                    \
-  auto* x = ctx.Input<phi::DenseTensor>("X");                \
-  auto* h0 = ctx.Input<phi::DenseTensor>("H0");              \
-  auto* c0 = ctx.Input<phi::DenseTensor>("C0");              \
-  auto* wx = ctx.Input<phi::DenseTensor>("WeightX");         \
-  auto* wh = ctx.Input<phi::DenseTensor>("WeightH");         \
-  auto* bias = ctx.Input<phi::DenseTensor>("Bias");          \
-  auto* xx = ctx.Output<phi::DenseTensor>("XX");             \
-  auto* hidden_out = ctx.Output<phi::DenseTensor>("Hidden"); \
-  auto* cell_out = ctx.Output<phi::DenseTensor>("Cell");     \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");            \
-  bool use_peepholes = ctx.Attr<bool>("use_peepholes");      \
-  auto x_dims = x->dims();   /* T x M*/                      \
-  auto wh_dims = wh->dims(); /* D x 4D*/                     \
-  const int M = x_dims[1];                                   \
-  const int D = wh_dims[0];                                  \
-  const int D4 = wh_dims[1]
-
-#define INIT_OTHER_DEFINES                                                    \
-  const T* x_data = x->data<T>();                                             \
-  const T* wx_data = wx->data<T>();                                           \
-  const T* wh_data = wh->data<T>();                                           \
-  /* diagonal weight*/                                                        \
-  const T* wp_data = bias->data<T>() + D4;                                    \
-  /* for peephole only*/                                                      \
-  T* checked_cell_data = nullptr;                                             \
-  auto place = ctx.GetPlace();                                                \
-  if (use_peepholes) {                                                        \
-    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                          \
-    auto* checked_cell = ctx.Output<phi::DenseTensor>("CheckedCell");         \
-    checked_cell_data = checked_cell->mutable_data<T>(place);                 \
-  }                                                                           \
-  const phi::jit::lstm_attr_t attr(                                           \
-      D,                                                                      \
-      phi::jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")),      \
-      phi::jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
-      phi::jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")),      \
-      use_peepholes);                                                         \
-  phi::jit::lstm_t one_step;                                                  \
-  one_step.wp = wp_data;                                                      \
-  one_step.checked = checked_cell_data;                                       \
-  auto ComputeC1H1 = phi::jit::KernelFuncs<phi::jit::LSTMC1H1Tuple<T>,        \
-                                           phi::CPUPlace>::Cache()            \
-                         .At(attr);                                           \
-  auto ComputeCtHt = phi::jit::KernelFuncs<phi::jit::LSTMCtHtTuple<T>,        \
-                                           phi::CPUPlace>::Cache()            \
-                         .At(attr)
-
-// Wh GEMM
-#define GEMM_WH_ADDON(bs, prev, out) \
-  blas.GEMM(CblasNoTrans,            \
-            CblasNoTrans,            \
-            bs,                      \
-            D4,                      \
-            D,                       \
-            static_cast<T>(1),       \
-            prev,                    \
-            D,                       \
-            wh_data,                 \
-            D4,                      \
-            static_cast<T>(1),       \
-            out,                     \
-            D4)
-
-  void SeqCompute(const framework::ExecutionContext& ctx) const {
-    INIT_BASE_DEFINES;
-    INIT_OTHER_DEFINES;
-    auto x_lod = x->lod();
-    const int total_T = static_cast<int>(x_dims[0]);
-    const int N = static_cast<int>(x_lod[0].size() - 1);
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
-    const T* c0_data = c0 ? c0->data<T>() : nullptr;
-    T* xx_data = xx->mutable_data<T>(place);
-    T* h_out_data = hidden_out->mutable_data<T>(place);
-    T* c_out_data = cell_out->mutable_data<T>(place);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-    phi::funcs::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data<T>());
-
-    int xx_offset = D4;
-    int gate_offset = D;
-    if (is_reverse) {
-      const int offset = (total_T - 1) * D;
-      xx_data = xx_data + offset * 4;
-      h_out_data = h_out_data + offset;
-      c_out_data = c_out_data + offset;
-      xx_offset = -D4;
-      gate_offset = -D;
-    }
-
-    for (int i = 0; i < N; ++i) {
-      int bid = is_reverse ? N - 1 - i : i;
-      int seq_len = static_cast<int>(x_lod[0][bid + 1] - x_lod[0][bid]);
-      const T* prev_c_data = nullptr;
-      const T* prev_h_data = nullptr;
-      int tstart = 0;
-      if (h0_data) {
-        prev_h_data = h0_data + bid * D;
-        prev_c_data = c0_data + bid * D;
-      } else {
-        one_step.gates = xx_data;
-        one_step.ct = c_out_data;
-        one_step.ht = h_out_data;
-        ComputeC1H1(&one_step, &attr);
-        tstart = 1;
-        // move one step
-        prev_h_data = h_out_data;
-        prev_c_data = c_out_data;
-        xx_data = xx_data + xx_offset;
-        h_out_data = h_out_data + gate_offset;
-        c_out_data = c_out_data + gate_offset;
-      }
-      for (int step = tstart; step < seq_len; ++step) {
-        GEMM_WH_ADDON(1, prev_h_data, xx_data);
-
-        one_step.gates = xx_data;
-        one_step.ct_1 = prev_c_data;
-        one_step.ct = c_out_data;
-        one_step.ht = h_out_data;
-        ComputeCtHt(&one_step, &attr);
-        // move one step
-        prev_h_data = h_out_data;
-        prev_c_data = c_out_data;
-        xx_data = xx_data + xx_offset;
-        h_out_data = h_out_data + gate_offset;
-        c_out_data = c_out_data + gate_offset;
-      }
-    }
-  }
-
-  void BatchCompute(const framework::ExecutionContext& ctx) const {
-    INIT_BASE_DEFINES;
-    if (x->lod()[0].size() == 2) {
-      xx->Resize({x_dims[0], D4});
-      SeqCompute(ctx);
-      return;
-    }
-    INIT_OTHER_DEFINES;
-
-    auto* reordered_h0 = ctx.Output<phi::DenseTensor>("ReorderedH0");
-    auto* reordered_c0 = ctx.Output<phi::DenseTensor>("ReorderedC0");
-    auto* batched_input = ctx.Output<phi::DenseTensor>("BatchedInput");
-    auto* batched_c_out = ctx.Output<phi::DenseTensor>("BatchedCell");
-    auto* batched_h_out = ctx.Output<phi::DenseTensor>("BatchedHidden");
-    T* xx_data = xx->mutable_data<T>(place);
-    T* batched_input_data = batched_input->mutable_data<T>(place);
-    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
-    T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
-    hidden_out->mutable_data<T>(place);
-    cell_out->mutable_data<T>(place);
-
-    phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-    phi::funcs::FCFunctor<DeviceContext, T> fc;
-    if (M > D4) {
-      fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>());
-      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
-    } else {
-      to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_input->set_lod(xx->lod());
-      fc(dev_ctx,
-         x_dims[0],
-         D4,
-         M,
-         xx_data,
-         wx_data,
-         batched_input_data,
-         bias->data<T>());
-    }
-
-    auto batched_lod = batched_input->lod();
-    const auto& seq_order = batched_lod[2];
-    const int max_bs = static_cast<int>(seq_order.size());
-    reordered_h0->Resize({max_bs, D});
-    reordered_c0->Resize({max_bs, D});
-
-    int tstart = 0;
-    T* prev_h_data = nullptr;
-    T* prev_c_data = nullptr;
-    if (h0) {
-      // reorder h0, c0
-      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
-      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
-      const T* h0_data = h0->data<T>();
-      const T* c0_data = c0->data<T>();
-      prev_h_data = reordered_h0_data;
-      prev_c_data = reordered_c0_data;
-      size_t sz = D;
-      for (int i = 0; i < max_bs; ++i) {
-        blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data);
-        blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data);
-        reordered_h0_data += D;
-        reordered_c0_data += D;
-      }
-    } else {
-      // compute without h0, c0
-      T* cur_in_data = batched_input_data;
-      T* cur_h_out_data = batched_h_out_data;
-      T* cur_c_out_data = batched_c_out_data;
-      for (int i = 0; i < max_bs; ++i) {
-        one_step.gates = cur_in_data;
-        one_step.ct = cur_c_out_data;
-        one_step.ht = cur_h_out_data;
-        ComputeC1H1(&one_step, &attr);
-
-        cur_in_data += D4;
-        cur_c_out_data += D;
-        cur_h_out_data += D;
-      }
-      tstart = 1;
-      prev_h_data = batched_h_out_data;
-      prev_c_data = batched_c_out_data;
-    }
-
-    // compute kernel part
-    const auto& batch_starts = batched_lod[0];
-    const int max_seq_len = static_cast<int>(batch_starts.size() - 1);
-    const int offset = tstart * max_bs * D;
-    batched_input_data = batched_input_data + offset * 4;
-    batched_h_out_data = batched_h_out_data + offset;
-    batched_c_out_data = batched_c_out_data + offset;
-    for (int step = tstart; step < max_seq_len; ++step) {
-      const int cur_bs =
-          static_cast<int>(batch_starts[step + 1] - batch_starts[step]);
-      GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
-      T* cur_in_data = batched_input_data;
-      T* cur_prev_c_data = prev_c_data;
-      T* cur_c_out_data = batched_c_out_data;
-      T* cur_h_out_data = batched_h_out_data;
-      for (int i = 0; i < cur_bs; ++i) {
-        one_step.gates = cur_in_data;
-        one_step.ct_1 = cur_prev_c_data;
-        one_step.ct = cur_c_out_data;
-        one_step.ht = cur_h_out_data;
-        ComputeCtHt(&one_step, &attr);
-
-        // move one batch
-        cur_in_data += D4;
-        cur_prev_c_data += D;
-        cur_c_out_data += D;
-        cur_h_out_data += D;
-      }
-      // move one step
-      prev_c_data = batched_c_out_data;
-      prev_h_data = batched_h_out_data;
-      batched_c_out_data = cur_c_out_data;
-      batched_h_out_data = cur_h_out_data;
-      batched_input_data = cur_in_data;
-    }
-
-    phi::funcs::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batched_h_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_h_out, hidden_out);
-    batched_c_out->set_lod(batched_lod);
-    to_seq(dev_ctx, *batched_c_out, cell_out);
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    if (ctx.Attr<bool>("use_seq")) {
-      SeqCompute(ctx);
-    } else {
-      BatchCompute(ctx);
-    }
-  }
-
-#undef GEMM_WH_ADDON
-#undef INIT_OTHER_DEFINES
-#undef INIT_BASE_DEFINES
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker);
-
-PD_REGISTER_STRUCT_KERNEL(
-    fusion_lstm, CPU, ALL_LAYOUT, ops::FusionLSTMKernel, float, double) {}
diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.h b/paddle/fluid/operators/fused/fusion_lstm_op.h
deleted file mode 100644
index c62060d7c225c..0000000000000
--- a/paddle/fluid/operators/fused/fusion_lstm_op.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-class FusionLSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override;
-};
-
-class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/onednn/fusion_lstm_onednn_op.cc b/paddle/fluid/operators/fused/onednn/fusion_lstm_onednn_op.cc
deleted file mode 100644
index 05c517fd9ac09..0000000000000
--- a/paddle/fluid/operators/fused/onednn/fusion_lstm_onednn_op.cc
+++ /dev/null
@@ -1,476 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
-#include "paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h"
-#include "paddle/phi/core/expect.h"
-
-namespace paddle {
-namespace operators {
-
-using phi::OneDNNContext;
-using phi::funcs::OneDNNGetDataType;
-using phi::funcs::OneDNNMemDesc;
-using phi::funcs::RNNReorderType;
-using OneDNNMemoryFormat = dnnl::memory::format_tag;
-
-template <typename T, typename T_out = T>
-class LSTMMKLDNNHandler
-    : public RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out> {
- public:
-  LSTMMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                    const OneDNNContext& dev_ctx,
-                    const dnnl::engine onednn_engine,
-                    phi::Place cpu_place UNUSED,
-                    const phi::DenseTensor* input,
-                    const phi::DenseTensor* weight_h,
-                    const phi::DenseTensor* h0,
-                    const phi::DenseTensor* c0 UNUSED,
-                    const bool is_reverse,
-                    const int64_t N,
-                    const int64_t Ti,
-                    const int64_t IC,
-                    const int64_t OC,
-                    const std::string& unique_name UNUSED)
-      : RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out>(
-            ctx,
-            dev_ctx,
-            onednn_engine,
-            ctx.GetPlace(),
-            input,
-            weight_h,
-            h0,
-            is_reverse,
-            N,
-            Ti,
-            IC,
-            OC,
-            4,
-            ctx.InputName("X") + ctx.InputName("WeightH")) {
-    if (unlikely(!this->isCached())) {
-      const bool is_INT8 = std::is_same<T, uint8_t>::value;
-      const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
-      // oneDNN kernel has hardcoded activation functions
-      PADDLE_ENFORCE_EQ(
-          ctx.Attr<std::string>("gate_activation"),
-          "sigmoid",
-          phi::errors::Unimplemented("oneDNN fusion_lstm supports only "
-                                     "sigmoid as a gate activation."));
-      PADDLE_ENFORCE_EQ(
-          ctx.Attr<std::string>("cell_activation"),
-          "tanh",
-          phi::errors::Unimplemented(
-              "oneDNN fusion_lstm supports only tanh as a cell activation."));
-      PADDLE_ENFORCE_EQ(
-          ctx.Attr<std::string>("candidate_activation"),
-          "tanh",
-          phi::errors::Unimplemented(
-              "oneDNN fusion_lstm supports only tanh a candidate activation."));
-
-      // Weights for int8 kernel are of a type s8
-      const auto weights_dt =
-          is_INT8 ? dnnl::memory::data_type::s8 : OneDNNGetDataType<T>();
-
-      // oneDNN RNN dimensions
-      const int64_t D = 1;  // Directions
-      const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
-      const int64_t G = 4;  // Number of Gates, 4 for LSTM
-
-      // Create memory descriptors
-      auto input_md = OneDNNMemDesc(
-          {Ti, N, IC}, OneDNNGetDataType<T>(), OneDNNMemoryFormat::tnc);
-      auto weight_x_md =
-          OneDNNMemDesc({L, D, IC, G, OC}, weights_dt, OneDNNMemoryFormat::any);
-      auto weight_h_md =
-          OneDNNMemDesc({L, D, OC, G, OC}, weights_dt, OneDNNMemoryFormat::any);
-      auto bias_md = OneDNNMemDesc(
-          {L, D, G, OC}, OneDNNGetDataType<float>(), OneDNNMemoryFormat::ldgo);
-      auto hidden_md = OneDNNMemDesc(
-          {Ti, N, OC}, OneDNNGetDataType<T_out>(), OneDNNMemoryFormat::any);
-
-      auto h0_md = OneDNNMemDesc(
-          {L, D, N, OC}, OneDNNGetDataType<T>(), OneDNNMemoryFormat::any);
-      auto c0_md = OneDNNMemDesc(
-          {L, D, N, OC}, OneDNNGetDataType<float>(), OneDNNMemoryFormat::any);
-
-      // Create LSTM oneDNN primitive
-      const auto direction =
-          is_reverse ? dnnl::rnn_direction::unidirectional_right2left
-                     : dnnl::rnn_direction::unidirectional_left2right;
-      if (!use_peepholes) {
-        this->AcquireForwardPrimitiveDescriptor(
-            this->attr_,
-            dnnl::prop_kind::forward_inference,
-            direction,
-            input_md,
-            h0_md,
-            c0_md,
-            weight_x_md,
-            weight_h_md,
-            bias_md,
-            hidden_md,
-            dnnl::memory::desc(),
-            dnnl::memory::desc());
-      } else {
-        auto weight_peephole_md = OneDNNMemDesc({L, D, 3, OC},
-                                                OneDNNGetDataType<float>(),
-                                                OneDNNMemoryFormat::ldgo);
-        this->AcquireForwardPrimitiveDescriptor(
-            this->attr_,
-            dnnl::prop_kind::forward_inference,
-            direction,
-            input_md,
-            h0_md,
-            c0_md,
-            weight_x_md,
-            weight_h_md,
-            weight_peephole_md,
-            bias_md,
-            hidden_md,
-            dnnl::memory::desc(),
-            dnnl::memory::desc());
-      }
-    }
-  }
-
-  // PaddlePaddle has different order of weights than oneDNN, so a reorder is
-  // needed
-  // PaddlePaddle:  {c, i, f, o}
-  // oneDNN:        {i, f, c, o}
-  template <typename U>
-  void ReorderGates(U* weights, int64_t I) {
-    size_t inner_block_size = this->OC;
-    size_t block_size = inner_block_size * this->G;
-    for (size_t i = 0; i < (size_t)I; ++i) {  // NOLINT
-      size_t offset = i * block_size;
-
-      U* base_pos = weights + offset;
-      std::swap_ranges(base_pos,
-                       base_pos + inner_block_size,
-                       base_pos + inner_block_size);  // c <-> i
-      std::swap_ranges(base_pos + inner_block_size,
-                       base_pos + 2 * inner_block_size,
-                       base_pos + 2 * inner_block_size);  // c <-> f
-    }
-  }
-
-  template <typename U>
-  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(
-      const phi::DenseTensor* weight_x) {
-    const std::string wx_key = this->memory_key_ + "@weight_x";
-    auto memory_p =
-        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
-
-    if (!memory_p) {
-      auto user_md = OneDNNMemDesc({1, 1, this->IC, this->G, this->OC},
-                                   OneDNNGetDataType<U>(),
-                                   OneDNNMemoryFormat::ldigo);
-      auto user_memory = dnnl::memory(user_md, this->engine_);
-
-      auto* weight_x_data = reinterpret_cast<U*>(user_memory.get_data_handle());
-      memcpy(weight_x_data,
-             weight_x->data<U>(),
-             sizeof(U) * this->IC * this->G * this->OC);
-
-      ReorderGates(weight_x_data, this->IC);
-
-      memory_p = std::make_shared<dnnl::memory>(
-          this->fwd_pd_->weights_layer_desc(), this->engine_);
-
-      auto& astream = OneDNNContext::tls().get_stream();
-      dnnl::reorder(user_memory, *memory_p, this->attr_)
-          .execute(astream, user_memory, *memory_p);
-
-      this->dev_ctx_.SetBlob(wx_key, memory_p);
-    }
-    return memory_p;
-  }
-
-  template <typename U>
-  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(
-      const phi::DenseTensor* weight_h) {
-    const std::string wh_key = this->memory_key_ + "@weight_h";
-    auto memory_p =
-        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
-
-    if (!memory_p) {
-      auto user_md = OneDNNMemDesc({1, 1, this->OC, this->G, this->OC},
-                                   OneDNNGetDataType<U>(),
-                                   OneDNNMemoryFormat::ldigo);
-      auto user_memory = dnnl::memory(user_md, this->engine_);
-
-      auto* weight_h_data = reinterpret_cast<U*>(user_memory.get_data_handle());
-      memcpy(weight_h_data,
-             weight_h->data<U>(),
-             sizeof(U) * this->OC * this->G * this->OC);
-
-      ReorderGates(weight_h_data, this->OC);
-
-      memory_p = std::make_shared<dnnl::memory>(
-          this->fwd_pd_->weights_iter_desc(), this->engine_);
-
-      auto& astream = OneDNNContext::tls().get_stream();
-      dnnl::reorder(user_memory, *memory_p, this->attr_)
-          .execute(astream, user_memory, *memory_p);
-
-      this->dev_ctx_.SetBlob(wh_key, memory_p);
-    }
-    return memory_p;
-  }
-
-  std::shared_ptr<dnnl::memory> AcquireBiasMemory(
-      const phi::DenseTensor* bias) {
-    const std::string bias_key = this->memory_key_ + "@bias";
-    auto memory_p = std::static_pointer_cast<dnnl::memory>(
-        this->dev_ctx_.GetBlob(bias_key));
-
-    if (!memory_p) {
-      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
-                                                this->engine_);
-      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
-      if (bias) {
-        const float* user_bias_data =
-            bias->data<float>();  // Bias in oneDNN is always float
-
-        memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
-
-        ReorderGates(bias_data, 1);
-      } else {
-        // oneDNN always need bias memory, if it's not provided in PP, let
-        // oneDNN allocate memory and set it to 0
-        memset(bias_data, 0, sizeof(float) * this->G * this->OC);
-      }
-
-      this->dev_ctx_.SetBlob(bias_key, memory_p);
-    }
-    return memory_p;
-  }
-
-  std::shared_ptr<dnnl::memory> AcquirePeepholeWeights(
-      const phi::DenseTensor* bias) {
-    const std::string peepholes_key = this->memory_key_ + "@peepholes_weights";
-    auto memory_p = std::static_pointer_cast<dnnl::memory>(
-        this->dev_ctx_.GetBlob(peepholes_key));
-
-    if (!memory_p) {
-      auto user_md = OneDNNMemDesc({1, 1, 3, this->OC},
-                                   OneDNNGetDataType<float>(),
-                                   OneDNNMemoryFormat::ldgo);
-      auto user_memory = dnnl::memory(user_md, this->engine_);
-      memory_p = std::make_shared<dnnl::memory>(
-          this->fwd_pd_->weights_peephole_desc(), this->engine_);
-      auto* peephole_weights_data =
-          reinterpret_cast<float*>(memory_p->get_data_handle());
-
-      const float* user_bias_data =
-          bias->data<float>();  // Bias in oneDNN is always float
-      memcpy(peephole_weights_data,
-             user_bias_data + 4 * this->OC,
-             sizeof(float) * 3 * this->OC);
-
-      this->dev_ctx_.SetBlob(peepholes_key, memory_p);
-    }
-    return memory_p;
-  }
-
-  std::shared_ptr<dnnl::memory> AcquireC0Memory(const phi::DenseTensor* c0) {
-    const std::string c0_key = this->memory_key_ + "@c0";
-    auto memory_p =
-        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(c0_key));
-
-    if (!memory_p) {
-      auto user_c0_memory = dnnl::memory();
-      if (c0) {
-        user_c0_memory =
-            dnnl::memory({{1, 1, this->N, this->OC},
-                          OneDNNGetDataType<float>(),
-                          OneDNNMemoryFormat::ldnc},
-                         this->engine_,
-                         phi::funcs::to_void_cast(c0->data<float>()));
-      } else {
-        user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC},
-                                       OneDNNGetDataType<float>(),
-                                       OneDNNMemoryFormat::ldnc},
-                                      this->engine_);
-        memset(user_c0_memory.get_data_handle(),
-               0,
-               sizeof(float) * this->N * this->OC);
-      }
-      memory_p = std::make_shared<dnnl::memory>(
-          this->fwd_pd_->src_iter_c_desc(), this->engine_);
-
-      auto& astream = OneDNNContext::tls().get_stream();
-      dnnl::reorder(user_c0_memory, *memory_p)
-          .execute(astream, user_c0_memory, *memory_p);
-
-      this->dev_ctx_.SetBlob(c0_key, memory_p);
-    }
-    return memory_p;
-  }
-};
-
-template <typename T, typename DeviceContext>
-class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const bool is_bf16 = std::is_same<T, phi::dtype::bfloat16>::value;
-    const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-
-    // BF16 does not support force output
-    if (!is_bf16 && force_fp32_output) {  // NOLINT
-      RunKernel<float>(ctx);
-    } else {
-      RunKernel<T>(ctx);
-    }
-  }
-
-  template <typename Tout = T>
-  void RunKernel(const framework::ExecutionContext& ctx) const {
-    auto& dev_ctx = ctx.template device_context<OneDNNContext>();
-    const auto& onednn_engine = dev_ctx.GetEngine();
-
-    // Get Tensors
-    const auto* input = ctx.Input<phi::DenseTensor>("X");
-    const auto* h0 = ctx.Input<phi::DenseTensor>("H0");
-    const auto* c0 = ctx.Input<phi::DenseTensor>("C0");
-    const auto* weight_x = ctx.Input<phi::DenseTensor>("WeightX");
-    const auto* weight_h = ctx.Input<phi::DenseTensor>("WeightH");
-    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
-    auto* hidden = ctx.Output<phi::DenseTensor>("Hidden");
-    auto x_dims = input->dims();
-    auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
-                          ? common::flatten_to_2d(x_dims, 1)
-                          : x_dims;
-    // Get attributes
-    const bool is_reverse = ctx.Attr<bool>("is_reverse");
-    const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
-
-    // Get tensor dimensions
-    const auto x_mat_dims_vec = common::vectorize(x_mat_dims);
-    const auto weight_h_dims = common::vectorize(weight_h->dims());
-    const auto& input_lod = input->lod()[0];
-
-    // Calculate RNN dimensions
-    const int64_t N = input_lod.size() - 1;  // Number of sentences (batches)
-    const int64_t Ti =  // Max length of the sentence in a batch
-        [&input_lod]() {
-          size_t res = 0;
-          for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
-            res = std::max(res, input_lod[i + 1] - input_lod[i]);
-          }
-          return res;
-        }();
-    const int64_t IC = x_mat_dims_vec[1];  // Input channels
-    const int64_t OC = weight_h_dims[0];   // Output channels
-
-    LSTMMKLDNNHandler<T, Tout> handler(
-        ctx,
-        dev_ctx,
-        onednn_engine,
-        ctx.GetPlace(),
-        input,
-        weight_h,
-        h0,
-        c0,
-        is_reverse,
-        N,
-        Ti,
-        IC,
-        OC,
-        ctx.InputName("X") + ctx.InputName("WeightH"));
-
-    auto input_memory_p =
-        handler.AcquireInputMemoryWithReorder(input, is_reverse);
-    auto c0_memory_p = handler.AcquireC0Memory(c0);
-
-    std::shared_ptr<dnnl::memory> h0_memory_p, weight_h_memory_p,
-        weight_x_memory_p;
-
-    if (weight_h->dtype() == phi::DataType::FLOAT32) {
-      h0_memory_p = handler.template AcquireH0Memory<float>(h0);
-      weight_x_memory_p =
-          handler.template AcquireWeightXMemory<float>(weight_x);
-      weight_h_memory_p =
-          handler.template AcquireWeightHMemory<float>(weight_h);
-    } else if (weight_h->dtype() == phi::DataType::BFLOAT16) {
-      h0_memory_p = handler.template AcquireH0Memory<phi::dtype::bfloat16>(h0);
-      weight_x_memory_p =
-          handler.template AcquireWeightXMemory<phi::dtype::bfloat16>(weight_x);
-      weight_h_memory_p =
-          handler.template AcquireWeightHMemory<phi::dtype::bfloat16>(weight_h);
-    } else {
-      h0_memory_p = handler.template AcquireH0Memory<uint8_t>(h0);
-      weight_x_memory_p =
-          handler.template AcquireWeightXMemory<int8_t>(weight_x);
-      weight_h_memory_p =
-          handler.template AcquireWeightHMemory<int8_t>(weight_h);
-    }
-
-    auto bias_memory_p = handler.AcquireBiasMemory(bias);
-    auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
-
-    std::unordered_map<int, dnnl::memory> lstm_args = {
-        {DNNL_ARG_SRC_LAYER, *input_memory_p},
-        {DNNL_ARG_SRC_ITER, *h0_memory_p},
-        {DNNL_ARG_SRC_ITER_C, *c0_memory_p},
-        {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
-        {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
-        {DNNL_ARG_BIAS, *bias_memory_p},
-        {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
-
-    if (use_peepholes) {
-      auto peephole_weight_p = handler.AcquirePeepholeWeights(bias);
-      std::pair<int, dnnl::memory> peepholes_weights(DNNL_ARG_WEIGHTS_PEEPHOLE,
-                                                     *peephole_weight_p);
-      lstm_args.insert(peepholes_weights);
-    }
-
-    auto lstm_forward_p = handler.AcquireForwardPrimitive();
-
-    auto& astream = OneDNNContext::tls().get_stream();
-    lstm_forward_p->execute(astream, lstm_args);
-    astream.wait();
-
-    auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
-    auto* hidden_data =
-        phi::funcs::to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
-    if (handler.is_NTC()) {
-      handler.reorderRNNdata(hidden_onednn_data,
-                             hidden_data,
-                             input_lod,
-                             is_reverse,
-                             RNNReorderType::NTC_PP);
-    } else {
-      handler.reorderRNNdata(hidden_onednn_data,
-                             hidden_data,
-                             input_lod,
-                             is_reverse,
-                             RNNReorderType::TNC_PP);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-PD_REGISTER_STRUCT_KERNEL(fusion_lstm,
-                          OneDNN,
-                          ONEDNN,
-                          ops::FusionLSTMMKLDNNKernel,
-                          float,
-                          uint8_t,
-                          phi::dtype::bfloat16) {}
diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc
index 30f1aff92a256..9f8b8d0744ffe 100644
--- a/paddle/fluid/operators/fused/resnet_unit_op.cc
+++ b/paddle/fluid/operators/fused/resnet_unit_op.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/common/float16.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 // Shape of bitmask
 static phi::DDim GetBitmaskDims(std::vector<int> out_shape) {
@@ -450,8 +449,7 @@ class ResNetUnitOpInferVarType
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(resnet_unit,
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
index 851c448865363..14021c1a2f659 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
@@ -13,8 +13,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/generator.h"
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class SoftmaxMaskFuseUpperTriangleOp : public framework::OperatorWithKernel {
  public:
@@ -89,8 +88,7 @@ class SoftmaxMaskFuseUpperTriangleGradOpMaker
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index bcd27e8186b7f..40680dbf00829 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -76,7 +76,7 @@ that were saved using the SaveCombine operator.
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
+namespace ops = paddle::operators;  // NOLINT
 
 REGISTER_OPERATOR(load_combine,
                   ops::LoadCombineOp,
diff --git a/paddle/fluid/operators/logspace_op.cc b/paddle/fluid/operators/logspace_op.cc
index 171ee209ebd0e..4088f4ba0f291 100644
--- a/paddle/fluid/operators/logspace_op.cc
+++ b/paddle/fluid/operators/logspace_op.cc
@@ -20,8 +20,7 @@
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class LogspaceOp : public framework::OperatorWithKernel {
  public:
@@ -63,8 +62,7 @@ class LogspaceOpMaker : public framework::OpProtoAndCheckerMaker {
     )DOC");
   }
 };
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 DECLARE_INFER_SHAPE_FUNCTOR(logspace,
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
deleted file mode 100644
index ac5cb81c060f0..0000000000000
--- a/paddle/fluid/operators/lstm_op.cc
+++ /dev/null
@@ -1,365 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstm_op.h"
-
-#include <memory>
-#include <string>
-
-namespace paddle {
-namespace operators {
-
-class LSTMOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTM");
-    OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTM");
-    OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTM");
-
-    OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "LSTM");
-    OP_INOUT_CHECK(ctx->HasOutput("Cell"), "Output", "Cell", "LSTM");
-
-    bool is_test = ctx->Attrs().Get<bool>("is_test");
-
-    if (!is_test) {
-      OP_INOUT_CHECK(
-          ctx->HasOutput("BatchGate"), "Output", "BatchGate", "LSTM");
-      OP_INOUT_CHECK(ctx->HasOutput("BatchCellPreAct"),
-                     "Output",
-                     "BatchCellPreAct",
-                     "LSTM");
-    }
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(),
-        2,
-        phi::errors::InvalidArgument(
-            "Input(X)'s rank must be 2, but received %d.", in_dims.size()));
-
-    if (ctx->HasInput("H0")) {
-      PADDLE_ENFORCE_EQ(
-          ctx->HasInput("C0"),
-          true,
-          phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM "
-                                "should not be null at the same time."));
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE_EQ(h_dims,
-                        c_dims,
-                        phi::errors::InvalidArgument(
-                            "The dimension of Input(H0) and Input(C0) should "
-                            "be the same, but received [%s] (H0) vs [%s] (C0).",
-                            h_dims,
-                            c_dims));
-    }
-
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    auto w_dims = ctx->GetInputDim("Weight");
-    PADDLE_ENFORCE_EQ(
-        w_dims.size(),
-        2,
-        phi::errors::InvalidArgument(
-            "The rank of Input(Weight) should be 2, but received %d.",
-            w_dims.size()));
-    PADDLE_ENFORCE_EQ(w_dims[0],
-                      frame_size,
-                      phi::errors::InvalidArgument(
-                          "The first dimension of Input(Weight) should be %d, "
-                          "but received %d.",
-                          frame_size,
-                          w_dims[0]));
-    PADDLE_ENFORCE_EQ(w_dims[1],
-                      4 * frame_size,
-                      phi::errors::InvalidArgument(
-                          "The second dimension of Input(Weight) should be 4 * "
-                          "%d, but received %d.",
-                          frame_size,
-                          w_dims[1]));
-
-    auto b_dims = ctx->GetInputDim("Bias");
-    PADDLE_ENFORCE_EQ(
-        b_dims.size(),
-        2,
-        phi::errors::InvalidArgument(
-            "The rank of Input(Bias) should be 2, but received %d.",
-            b_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        b_dims[0],
-        1,
-        phi::errors::InvalidArgument(
-            "The first dimension of Input(Bias) should be 1, but received %d.",
-            b_dims[0]));
-
-    if (ctx->Attrs().Get<bool>("use_peepholes")) {
-      PADDLE_ENFORCE_EQ(
-          b_dims[1],
-          7 * frame_size,
-          phi::errors::InvalidArgument(
-              "The second dimension of Input(Bias) should be 7 * %d if enable "
-              "peepholes connection, but received %d.",
-              frame_size,
-              b_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          b_dims[1],
-          4 * frame_size,
-          phi::errors::InvalidArgument(
-              "The second dimension of Input(Bias) should be 4 * %d if disable "
-              "peepholes connection, but received %d.",
-              frame_size,
-              b_dims[1]));
-    }
-
-    phi::DDim out_dims({in_dims[0], frame_size});
-    ctx->SetOutputDim("Hidden", out_dims);
-    ctx->SetOutputDim("Cell", out_dims);
-    if (!is_test) {
-      ctx->SetOutputDim("BatchGate", in_dims);
-      ctx->SetOutputDim("BatchCellPreAct", out_dims);
-    }
-    ctx->ShareLoD("Input", "Hidden");
-    ctx->ShareLoD("Input", "Cell");
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
-                          ctx.device_context().GetPlace());
-  }
-};
-
-class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "Input",
-        "(phi::DenseTensor) the first input is a phi::DenseTensor, which "
-        "support variable-time length input sequence. The underlying tensor in "
-        "this phi::DenseTensor is a matrix with shape (T X 4D), where T is the "
-        "total time steps in this mini-batch, D is the hidden size.");
-    AddInput("H0",
-             "(Tensor, optional) the initial hidden state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size and D is the hidden size.")
-        .AsDispensable();
-    AddInput("C0",
-             "(Tensor, optional) the initial cell state is an optional "
-             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. `H0` and `C0` can be NULL but only at the same time.")
-        .AsDispensable();
-    AddInput("Weight",
-             "(Tensor) the learnable hidden-hidden weights."
-             " - The shape is (D x 4D), where D is the hidden size. "
-             " - Weight = {W_ch, W_ih, W_fh, W_oh}");
-    AddInput("Bias",
-             "(Tensor) the learnable weights, which contains two parts: "
-             "input-hidden bias weight and peephole connections weight if "
-             "setting `use_peepholes` True. "
-             "1. `use_peepholes = False` "
-             " - The shape is (1 x 4D). "
-             " - Bias = {b_c, b_i, b_f, b_o}."
-             "2. `use_peepholes = True` "
-             " - The shape is (1 x 7D). "
-             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
-    AddOutput("Hidden",
-              "(phi::DenseTensor) the hidden state of LSTM operator. "
-              "The shape is (T x D), and lod is the same with the `Input`.");
-    AddOutput("Cell",
-              "(phi::DenseTensor) the cell state of LSTM operator. "
-              "The shape is (T x D), and lod is the same with the `Input`.");
-    AddOutput(
-        "BatchGate",
-        "(phi::DenseTensor) This phi::DenseTensor contains input gate, forget "
-        "gate "
-        "and output gate after the nonlinear computation. This "
-        "phi::DenseTensor has the same shape as the reorganized input, which "
-        "is also be called batch input. The LoD size is 2. The first "
-        "LoD is the batch offsets and the second LoD contains the "
-        "indexes, which denote the position of reorganized sequence "
-        "in the raw input.")
-        .AsIntermediate()
-        .AsExtra();
-    AddOutput("BatchCellPreAct",
-              "(phi::DenseTensor) This phi::DenseTensor is obtained in the "
-              "forward and used "
-              "in the backward.")
-        .AsIntermediate()
-        .AsExtra();
-    AddAttr<bool>("use_peepholes",
-                  "(bool, default: True) "
-                  "whether to enable diagonal/peephole connections.")
-        .SetDefault(true);
-    AddAttr<bool>("is_reverse",
-                  "(bool, default: False) "
-                  "whether to compute reversed LSTM.")
-        .SetDefault(false);
-    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
-    AddAttr<std::string>(
-        "gate_activation",
-        "(string, default: sigmoid)"
-        "The activation for input gate, forget gate and output "
-        "gate, `sigmoid` by default.")
-        .SetDefault("sigmoid")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("cell_activation",
-                         "(string, default: tanh)"
-                         "The activation for cell output, `tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<std::string>("candidate_activation",
-                         "(string, default: tanh)"
-                         "The activation for candidate hidden state, "
-                         "`tanh` by default.")
-        .SetDefault("tanh")
-        .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddComment(R"DOC(
-Long-Short Term Memory (LSTM) Operator.
-
-The default implementation is diagonal/peephole connection
-(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
-
-$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$
-
-$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$
-
-$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$
-
-$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$
-
-$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
-
-$$ h_t = o_t \\odot act_h(c_t) $$
-
-- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-  of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-  are diagonal weight matrices for peephole connections. In our implementation,
-  we use vectors to represent these diagonal weight matrices.
-- The b terms denote bias vectors ($b_i$ is the input gate bias vector).
-- $\sigma$ is the non-line activations, such as logistic sigmoid function.
-- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
-  and cell activation vectors, respectively, all of which have the same size as
-  the cell output activation vector $h$.
-- The $\odot$ is the element-wise product of the vectors.
-- $act_g$ and $act_h$ are the cell input and cell output activation functions
-  and `tanh` is usually used for them.
-- $\tilde{c_t}$ is also called candidate hidden state,
-  which is computed based on the current input and the previous hidden state.
-
-Set `use_peepholes` False to disable peephole connection. The formula
-is omitted here, please refer to the paper
-http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-
-Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
-operations on the input $x_{t}$ are NOT included in this operator.
-Users can choose to use fully-connect operator before LSTM operator.
-
-)DOC");
-  }
-};
-
-class LSTMGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTM@Grad");
-    OP_INOUT_CHECK(ctx->HasInput("Hidden"), "Input", "Hidden", "LSTM@Grad");
-    OP_INOUT_CHECK(ctx->HasInput("Cell"), "Input", "Cell", "LSTM@Grad");
-    OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTM@Grad");
-    OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTM@Grad");
-
-    OP_INOUT_CHECK(
-        ctx->HasInput("BatchGate"), "Input", "BatchGate", "LSTM@Grad");
-    OP_INOUT_CHECK(ctx->HasInput("BatchCellPreAct"),
-                   "Input",
-                   "BatchCellPreAct",
-                   "LSTM@Grad");
-
-    auto SetOutGradDim = [&ctx](const std::string& name) {
-      auto g_name = framework::GradVarName(name);
-      if (ctx->HasOutput(g_name))
-        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
-    };
-
-    SetOutGradDim("Input");
-    SetOutGradDim("Weight");
-    SetOutGradDim("Bias");
-    SetOutGradDim("H0");
-    SetOutGradDim("C0");
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
-                          ctx.device_context().GetPlace());
-  }
-};
-
-template <typename T>
-class LSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("lstm_grad");
-    op->SetAttrMap(this->Attrs());
-    op->SetInput("Input", this->Input("Input"));
-    op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
-
-    if (this->HasInput("H0")) {
-      op->SetInput("H0", this->Input("H0"));
-      op->SetOutput(framework::GradVarName("H0"), this->InputGrad("H0"));
-    }
-
-    if (this->HasInput("C0")) {
-      op->SetInput("C0", this->Input("C0"));
-      op->SetOutput(framework::GradVarName("C0"), this->InputGrad("C0"));
-    }
-
-    op->SetInput("Weight", this->Input("Weight"));
-    op->SetOutput(framework::GradVarName("Weight"), this->InputGrad("Weight"));
-
-    op->SetInput("Bias", this->Input("Bias"));
-    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
-
-    op->SetInput("Cell", this->Output("Cell"));
-
-    op->SetInput("Hidden", this->Output("Hidden"));
-    op->SetInput(framework::GradVarName("Hidden"), this->OutputGrad("Hidden"));
-
-    op->SetInput("BatchGate", this->Output("BatchGate"));
-    op->SetInput("BatchCellPreAct", this->Output("BatchCellPreAct"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lstm,
-                  ops::LSTMOp,
-                  ops::LSTMOpMaker,
-                  ops::LSTMGradOpMaker<paddle::framework::OpDesc>,
-                  ops::LSTMGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
-
-PD_REGISTER_STRUCT_KERNEL(
-    lstm, CPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {}
-PD_REGISTER_STRUCT_KERNEL(
-    lstm_grad, CPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc
deleted file mode 100644
index b06521088a95a..0000000000000
--- a/paddle/fluid/operators/lstm_op.cu.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/lstm_op.h"
-
-namespace ops = paddle::operators;
-PD_REGISTER_STRUCT_KERNEL(
-    lstm, GPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {}
-PD_REGISTER_STRUCT_KERNEL(
-    lstm_grad, GPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
deleted file mode 100644
index 9eaba45a2d597..0000000000000
--- a/paddle/fluid/operators/lstm_op.h
+++ /dev/null
@@ -1,444 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
-#include "paddle/phi/kernels/funcs/lstm_compute.h"
-#include "paddle/phi/kernels/funcs/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const phi::DenseTensor& src,
-                             phi::Vector<size_t> index_lod,
-                             phi::DenseTensor* dst,
-                             bool indexed_src) {
-  phi::funcs::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
-template <typename T, typename DeviceContext>
-class LSTMKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    bool is_test = ctx.Attr<bool>("is_test");
-
-    auto* input = ctx.Input<phi::DenseTensor>("Input");
-    auto* weight = ctx.Input<phi::DenseTensor>("Weight");
-    auto* bias = ctx.Input<phi::DenseTensor>("Bias");
-
-    auto* hidden_t0 = ctx.Input<phi::DenseTensor>("H0");
-    auto* cell_t0 = ctx.Input<phi::DenseTensor>("C0");
-
-    phi::DenseTensor* batch_gate = nullptr;
-    phi::DenseTensor batch_gate_temp;
-    if (is_test) {
-      batch_gate = &batch_gate_temp;
-      batch_gate->Resize(input->dims());
-    } else {
-      batch_gate = ctx.Output<phi::DenseTensor>("BatchGate");
-    }
-    batch_gate->mutable_data<T>(ctx.GetPlace());
-    auto* hidden_out = ctx.Output<phi::DenseTensor>("Hidden");
-    hidden_out->mutable_data<T>(ctx.GetPlace());
-    auto* cell_out = ctx.Output<phi::DenseTensor>("Cell");
-    cell_out->mutable_data<T>(ctx.GetPlace());
-
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
-    phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
-
-    auto in_dims = input->dims();
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    phi::DDim dims({in_dims[0], frame_size});
-
-    if (bias) {
-      phi::DenseTensor b = *bias;
-      b.Resize({bias->numel(), 1});
-      phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size);
-      phi::funcs::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
-    }
-
-    phi::funcs::LstmMetaValue<T> lstm_value;
-    if (bias && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_data = const_cast<T*>(bias->data<T>());
-      // the code style in LstmMetaValue will be updated later.
-
-      lstm_value.check_ig = bias_data + 4 * frame_size;
-      lstm_value.check_fg = lstm_value.check_ig + frame_size;
-      lstm_value.check_og = lstm_value.check_fg + frame_size;
-    } else {
-      lstm_value.check_ig = nullptr;
-      lstm_value.check_fg = nullptr;
-      lstm_value.check_og = nullptr;
-    }
-    lstm_value.prev_state_value = nullptr;
-    phi::DenseTensor ordered_c0;
-
-    phi::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (cell_t0) {
-      // Since the batch computing for LSTM reorders the input sequence
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(
-          device_ctx, *cell_t0, order, &ordered_c0, true);
-      lstm_value.prev_state_value = ordered_c0.data<T>();
-    }
-
-    // Use the local variable as here.
-    phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp;
-    phi::DenseTensor* batch_cell_pre_act;
-    if (is_test) {
-      batch_cell_pre_act = &batch_cell_pre_act_temp;
-    } else {
-      batch_cell_pre_act = ctx.Output<phi::DenseTensor>("BatchCellPreAct");
-    }
-    batch_hidden.mutable_data<T>(dims, ctx.GetPlace());
-    batch_cell.mutable_data<T>(dims, ctx.GetPlace());
-    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
-
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto gate_act = phi::funcs::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = phi::funcs::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = phi::funcs::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(device_ctx);
-    for (size_t n = 0; n < num_batch; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend);
-      phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend);
-      phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend);
-      phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
-
-      int cur_batch_size = bend - bstart;
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(pre_hidden_t,
-                    false,
-                    *weight,
-                    false,
-                    static_cast<T>(1.0),
-                    &gate_t,
-                    static_cast<T>(1.0));
-      } else if (hidden_t0) {
-        // If n == 0 and there is no initialized hidden state, that is to say
-        // the H0 is zeros, the calculation W_h * H0 will be skiped.
-        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
-
-        // Since the batch computing for LSTM reorders the input sequence
-        // according to their length. The initialized hidden state also needs
-        // to reorder.
-        phi::DenseTensor ordered_h0;
-        ReorderInitState<DeviceContext, T>(
-            device_ctx, *hidden_t0, order, &ordered_h0, true);
-        blas.MatMul(ordered_h0,
-                    false,
-                    *weight,
-                    false,
-                    static_cast<T>(1.0),
-                    &gate_t,
-                    static_cast<T>(1.0));
-      }
-
-      lstm_value.gate_value = gate_t.data<T>();
-      lstm_value.output_value = out_t.data<T>();
-      lstm_value.state_value = cell_t.data<T>();
-      lstm_value.state_active_value = cell_pre_act_t.data<T>();
-      T cell_clip = 0.0;
-      phi::funcs::LstmUnitFunctor<DeviceContext, T>::compute(device_ctx,
-                                                             lstm_value,
-                                                             frame_size,
-                                                             cur_batch_size,
-                                                             cell_clip,
-                                                             gate_act,
-                                                             cell_act,
-                                                             cand_act);
-      lstm_value.prev_state_value = lstm_value.state_value;
-    }
-
-    phi::funcs::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden.set_lod(batch_gate->lod());
-    // restore the output hidden in phi::DenseTensor from the batch hidden
-    to_seq(device_ctx, batch_hidden, hidden_out);
-
-    batch_cell.set_lod(batch_gate->lod());
-    // restore the output cell state in phi::DenseTensor from the batch cell
-    to_seq(device_ctx, batch_cell, cell_out);
-  }
-};
-
-template <typename T, typename DeviceContext>
-class LSTMGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<phi::DenseTensor>("Input");
-    auto* weight = ctx.Input<phi::DenseTensor>("Weight");
-    auto* bias = ctx.Input<phi::DenseTensor>("Bias");
-
-    auto* hidden_out = ctx.Input<phi::DenseTensor>("Hidden");
-    auto* cell_out = ctx.Input<phi::DenseTensor>("Cell");
-
-    auto* batch_gate = ctx.Input<phi::DenseTensor>("BatchGate");
-    auto* batch_cell_pre_act = ctx.Input<phi::DenseTensor>("BatchCellPreAct");
-
-    auto* hidden_g =
-        ctx.Input<phi::DenseTensor>(framework::GradVarName("Hidden"));
-
-    auto* in_g = ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
-    auto* weight_g =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Weight"));
-    auto* bias_g = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
-
-    auto* h0 = ctx.Input<phi::DenseTensor>("H0");
-    auto* c0 = ctx.Input<phi::DenseTensor>("C0");
-
-    auto* h0_g = ctx.Output<phi::DenseTensor>(framework::GradVarName("H0"));
-    auto* c0_g = ctx.Output<phi::DenseTensor>(framework::GradVarName("C0"));
-
-    auto& device_ctx = ctx.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    if (weight_g) {
-      weight_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, weight_g, static_cast<T>(0.0));
-    }
-
-    // ordered_h0/c0 is the reordered hidden/cell initialization.
-    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
-    // initialization.
-    phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    phi::Vector<size_t> order(batch_gate->lod()[2]);
-
-    if (c0) {
-      ReorderInitState<DeviceContext, T>(
-          device_ctx, *c0, order, &ordered_c0, true);
-    }
-    if (c0 && c0_g) {
-      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
-    }
-
-    auto in_dims = input->dims();
-    auto out_dims = hidden_g->dims();
-    int frame_size = static_cast<int>(in_dims[1] / 4);
-    PADDLE_ENFORCE_EQ(
-        frame_size,
-        out_dims[1],
-        phi::errors::InvalidArgument(
-            "The second dimension of Input(" +
-                framework::GradVarName("Hidden") +
-                ") should be %d, but received %d in LSTM@Grad operator.",
-            frame_size,
-            out_dims[1]));
-
-    phi::funcs::LstmMetaValue<T> lstm_value;
-    if (bias && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_data = const_cast<T*>(bias->data<T>());
-      lstm_value.check_ig = bias_data + 4 * frame_size;
-      lstm_value.check_fg = lstm_value.check_ig + frame_size;
-      lstm_value.check_og = lstm_value.check_fg + frame_size;
-    } else {
-      lstm_value.check_ig = nullptr;
-      lstm_value.check_fg = nullptr;
-      lstm_value.check_og = nullptr;
-    }
-
-    phi::funcs::LstmMetaGrad<T> lstm_grad;
-
-    if (bias && bias_g) {
-      bias_g->mutable_data<T>(ctx.GetPlace());
-      zero(device_ctx, bias_g, static_cast<T>(0.0));
-    }
-    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
-      T* bias_g_data = bias_g->data<T>();
-      lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
-      lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
-      lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
-    } else {
-      lstm_grad.check_ig_grad = nullptr;
-      lstm_grad.check_fg_grad = nullptr;
-      lstm_grad.check_og_grad = nullptr;
-    }
-
-    phi::funcs::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-
-    auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx,
-                                            const phi::DenseTensor& src,
-                                            const phi::DDim& dims,
-                                            phi::DenseTensor& dst) {
-      dst.mutable_data<T>(dims, ctx.GetPlace());
-      dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, &dst, false);
-    };
-
-    phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell;
-    ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden);
-    ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g);
-    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
-
-    phi::DenseTensor batch_cell_g, batch_gate_g;
-    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
-    // TODO(qingqing) support the case output cell has gradient.
-    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
-    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
-    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
-    batch_gate_g.set_lod(batch_gate->lod());
-
-    auto gate_act = phi::funcs::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = phi::funcs::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = phi::funcs::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(device_ctx);
-    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      phi::DenseTensor gate = batch_gate->Slice(bstart, bend);
-      phi::DenseTensor cell = batch_cell.Slice(bstart, bend);
-      phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
-      lstm_value.gate_value = gate.data<T>();
-      lstm_value.state_value = cell.data<T>();
-      lstm_value.state_active_value = cell_pre_act.data<T>();
-
-      phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend);
-      phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend);
-      phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend);
-      lstm_grad.state_grad = cell_g.data<T>();
-      lstm_grad.gate_grad = gate_g.data<T>();
-      lstm_grad.output_grad = out_g.data<T>();
-
-      if (n > 0) {
-        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
-        phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
-        phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
-        lstm_value.prev_state_value = cell_pre.data<T>();
-        lstm_grad.prev_state_grad = cell_pre_g.data<T>();
-      } else {
-        lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
-        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
-      }
-
-      // lstm_value.output_value not used in bp, set to nullptr
-      // lstm_grad.state_active_grad not used in bp, set to nullptr
-      lstm_value.output_value = nullptr;
-      lstm_grad.state_active_grad = nullptr;
-      int cur_batch_size = bend - bstart;
-      T cell_clip = 0.0;
-      phi::funcs::LstmUnitGradFunctor<DeviceContext, T>::compute(device_ctx,
-                                                                 lstm_value,
-                                                                 lstm_grad,
-                                                                 frame_size,
-                                                                 cur_batch_size,
-                                                                 cell_clip,
-                                                                 gate_act,
-                                                                 cell_act,
-                                                                 cand_act);
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(gate_g,
-                    false,
-                    *weight,
-                    true,
-                    static_cast<T>(1.0),
-                    &pre_hidden_g,
-                    static_cast<T>(1.0));
-        if (weight_g) {
-          /* backward weight */
-          auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
-          blas.MatMul(pre_hidden,
-                      true,
-                      gate_g,
-                      false,
-                      static_cast<T>(1.0),
-                      weight_g,
-                      static_cast<T>(1.0));
-        }
-      } else {
-        if (h0 && weight_g) {
-          ReorderInitState<DeviceContext, T>(
-              device_ctx, *h0, order, &ordered_h0, true);
-          blas.MatMul(ordered_h0,
-                      true,
-                      gate_g,
-                      false,
-                      static_cast<T>(1.0),
-                      weight_g,
-                      static_cast<T>(1.0));
-        }
-        if (h0 && h0_g) {
-          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          blas.MatMul(gate_g,
-                      false,
-                      *weight,
-                      true,
-                      static_cast<T>(1.0),
-                      &ordered_h0_g,
-                      static_cast<T>(0.0));
-        }
-      }
-    }
-
-    phi::funcs::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    if (in_g) {
-      /* backward data */
-      in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, in_g);
-    }
-    if (bias && bias_g) {
-      /* backward bias */
-      phi::DenseTensor b_g = *bias_g;
-      b_g.Resize({bias_g->numel(), 1});
-      phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
-      phi::funcs::ColwiseSum<DeviceContext, T> col_sum;
-      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
-    }
-
-    if (h0 && h0_g) {
-      ReorderInitState<DeviceContext, T>(
-          device_ctx, ordered_h0_g, order, h0_g, false);
-    }
-    if (c0 && c0_g) {
-      ReorderInitState<DeviceContext, T>(
-          device_ctx, ordered_c0_g, order, c0_g, false);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc
index 85cd21831c9b1..b06c7a01a718a 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.cc
+++ b/paddle/fluid/operators/memcpy_h2d_op.cc
@@ -18,20 +18,17 @@ limitations under the License. */
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class OpDesc;
 class InferShapeContext;
 template <typename T>
 class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class MemcpyH2DOp : public framework::OperatorWithKernel {
  public:
@@ -104,8 +101,7 @@ raise error if the type is not listed above.
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc
index 2c1486636561b..baf6fbbcf8661 100644
--- a/paddle/fluid/operators/nop_op.cc
+++ b/paddle/fluid/operators/nop_op.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class NopOp : public framework::OperatorWithKernel {
  public:
@@ -45,8 +44,7 @@ establish the dependency between input and output tensors.
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc
deleted file mode 100644
index 7fb293891d3a5..0000000000000
--- a/paddle/fluid/operators/number_count_op.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/number_count_op.h"
-
-namespace paddle {
-namespace operators {
-
-class NumberCountOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("numbers"), "Input", "numbers", "NumberCount");
-    OP_INOUT_CHECK(
-        ctx->HasOutput("Out"), "Output", "number_count", "NumberCount");
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    // the dtype of the numbers should be same as int64
-    auto number_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "numbers");
-
-    PADDLE_ENFORCE_EQ(number_dtype,
-                      framework::proto::VarType::INT64,
-                      phi::errors::InvalidArgument(
-                          "The dtype of the number_dtype should be int64"));
-    return phi::KernelKey(number_dtype, ctx.GetPlace());
-  }
-};
-
-class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("numbers", "(Tensor) The input gate index tensor.");
-    AddOutput("Out", "(Tensor) The output number count tensor.");
-    AddAttr<int>("upper_range", "(int), The number of different numbers.");
-
-    AddComment(R"DOC(number_count Operator.count numbers.)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(number_count,
-                             ops::NumberCountOp,
-                             ops::NumberCountOpMaker);
diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h
deleted file mode 100644
index 12ad10c3e73cc..0000000000000
--- a/paddle/fluid/operators/number_count_op.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#if defined(PADDLE_WITH_GLOO)
-#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename DeviceContext>
-class NumberCountOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ops_signature/assign_pos_sig.cc b/paddle/fluid/operators/ops_signature/assign_pos_sig.cc
deleted file mode 100644
index 010d164d83dae..0000000000000
--- a/paddle/fluid/operators/ops_signature/assign_pos_sig.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature AssignPosOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature(
-      "assign_pos", {"X", "cum_count", "eff_num_len"}, {}, {"Out"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(assign_pos, phi::AssignPosOpArgumentMapping);
diff --git a/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc b/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc
deleted file mode 100644
index d622a8a342789..0000000000000
--- a/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature DecayedAdagradOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("decayed_adagrad",
-                         {"Param", "Grad", "Moment", "LearningRate"},
-                         {"decay", "epsilon"},
-                         {"ParamOut", "MomentOut"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(decayed_adagrad,
-                           phi::DecayedAdagradOpArgumentMapping);
diff --git a/paddle/fluid/operators/ops_signature/fusion_group_sig.cc b/paddle/fluid/operators/ops_signature/fusion_group_sig.cc
deleted file mode 100644
index 666e6f77d218f..0000000000000
--- a/paddle/fluid/operators/ops_signature/fusion_group_sig.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature FusionGroupOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("fusion_group",
-                         {"Inputs"},
-                         {"outs_dtype", "inputs_dtype", "func_name", "type"},
-                         {"Outs"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(fusion_group, phi::FusionGroupOpArgumentMapping);
diff --git a/paddle/fluid/operators/ops_signature/prune_gate_by_capacity_sig.cc b/paddle/fluid/operators/ops_signature/prune_gate_by_capacity_sig.cc
deleted file mode 100644
index b8bf7248cd701..0000000000000
--- a/paddle/fluid/operators/ops_signature/prune_gate_by_capacity_sig.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature PruneGateByCapacityOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("prune_gate_by_capacity",
-                         {"GateIdx", "ExpertCount"},
-                         {"n_expert", "n_worker"},
-                         {"NewGateIdx"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(prune_gate_by_capacity,
-                           phi::PruneGateByCapacityOpArgumentMapping);
diff --git a/paddle/fluid/operators/ops_signature/rrelu_sig.cc b/paddle/fluid/operators/ops_signature/rrelu_sig.cc
deleted file mode 100644
index 18bda743e3255..0000000000000
--- a/paddle/fluid/operators/ops_signature/rrelu_sig.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature RReluOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature(
-      "rrelu", {"X"}, {"lower", "upper", "is_test"}, {"Out", "Noise"});
-}
-
-KernelSignature RReluGradGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature(
-      "rrelu_grad", {"X", "Noise", "Out@GRAD"}, {}, {"X@GRAD"});
-}
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(rrelu, phi::RReluOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(rrelu_grad, phi::RReluGradGradOpArgumentMapping);
diff --git a/paddle/fluid/operators/ops_signature/shuffle_batch_sig.cc b/paddle/fluid/operators/ops_signature/shuffle_batch_sig.cc
deleted file mode 100644
index 22a9f76d95dd3..0000000000000
--- a/paddle/fluid/operators/ops_signature/shuffle_batch_sig.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature ShuffleBatchOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("shuffle_batch",
-                         {"X", "Seed"},
-                         {"startup_seed"},
-                         {"Out", "ShuffleIdx", "SeedOut"});
-}
-
-KernelSignature ShuffleBatchGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("shuffle_batch_grad",
-                         {"ShuffleIdx", "Out@GRAD"},
-                         {"startup_seed"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(shuffle_batch, phi::ShuffleBatchOpArgumentMapping);
-
-PD_REGISTER_ARG_MAPPING_FN(shuffle_batch_grad,
-                           phi::ShuffleBatchGradOpArgumentMapping);
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
deleted file mode 100644
index 23441206a55c1..0000000000000
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-#include "paddle/phi/infermeta/multiary.h"
-
-namespace paddle {
-namespace operators {
-
-class DecayedAdagradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("Param"), "Input", "Param", "DecayedAdagradOp");
-    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp");
-    OP_INOUT_CHECK(
-        ctx->HasInput("Moment"), "Input", "Moment", "DecayedAdagradOp");
-    OP_INOUT_CHECK(ctx->HasInput("LearningRate"),
-                   "Input",
-                   "LearningRate",
-                   "DecayedAdagradOp");
-    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(),
-                      framework::proto::VarType::LOD_TENSOR,
-                      phi::errors::InvalidArgument(
-                          "The input var's type should be phi::DenseTensor, "
-                          "but the received is %s",
-                          ctx->Inputs("Param").front(),
-                          ctx->GetInputsVarType("Param").front()));
-    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad").front(),
-                      framework::proto::VarType::LOD_TENSOR,
-                      phi::errors::InvalidArgument(
-                          "The input var's type should be phi::DenseTensor, "
-                          "but the received is %s",
-                          ctx->Inputs("Grad").front(),
-                          ctx->GetInputsVarType("Grad").front()));
-
-    OP_INOUT_CHECK(
-        ctx->HasOutput("ParamOut"), "Output", "ParamOut", "DecayedAdagradOp");
-    OP_INOUT_CHECK(
-        ctx->HasOutput("MomentOut"), "Output", "MomentOut", "DecayedAdagradOp");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_NE(common::product(lr_dims),
-                      0,
-                      phi::errors::InvalidArgument(
-                          "Maybe the Input variable LearningRate has not "
-                          "been initialized. You may need to confirm "
-                          "if you put exe.run(startup_program) "
-                          "after optimizer.minimize function."));
-    PADDLE_ENFORCE_EQ(
-        common::product(lr_dims),
-        1,
-        phi::errors::InvalidArgument("LearningRate should have one element"));
-    auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims,
-        ctx->GetInputDim("Grad"),
-        phi::errors::InvalidArgument(
-            "Param and Grad input of DecayedAdagradOp should have "
-            "the same dimension."));
-    PADDLE_ENFORCE_EQ(
-        param_dims,
-        ctx->GetInputDim("Moment"),
-        phi::errors::InvalidArgument(
-            "Param and Moment input of DecayedAdagradOp should have "
-            "the same dimension."));
-
-    ctx->SetOutputDim("ParamOut", param_dims);
-    ctx->SetOutputDim("MomentOut", param_dims);
-  }
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"),
-                          ctx.GetPlace());
-  }
-};
-
-class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("Moment", "(Tensor) Second moment");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("MomentOut", "(Tensor) Output second moment");
-
-    AddAttr<float>("decay",
-                   "(float, default 0.95) "
-                   "Discounting factor for coming gradient")
-        .SetDefault(0.95);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-6) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0e-6f);
-    AddComment(R"DOC(
-Decayed Adagrad Optimizer.
-
-The update is done as follows:
-
-$$
-moment\_out = decay * moment + (1 - decay) * grad * grad \\
-param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon}
-$$
-
-The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have an epsilon attribute. It is added here for numerical
-stability to avoid the division by zero error.
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-DECLARE_INFER_SHAPE_FUNCTOR(decayed_adagrad,
-                            DecayedAdagradShapeFunctor,
-                            PD_INFER_META(phi::DecayedAdagradInferMeta));
-
-REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad,
-                             ops::DecayedAdagradOp,
-                             ops::DecayedAdagradOpMaker,
-                             DecayedAdagradShapeFunctor);
diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc
index 7ef426cedad19..1385d039d932b 100644
--- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc
@@ -16,8 +16,7 @@
 
 #include "paddle/fluid/framework/op_version_registry.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class SparseMomentumOpInferVarType : public framework::VarTypeInference {
  public:
@@ -107,8 +106,7 @@ else:   \\
 )DOC");
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cc b/paddle/fluid/operators/prune_gate_by_capacity_op.cc
deleted file mode 100644
index 4e4bc4d291d68..0000000000000
--- a/paddle/fluid/operators/prune_gate_by_capacity_op.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-
-class PruneGateByCapacityOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("GateIdx"), "Input", "GateIdx", "prun_gate_by_capacity");
-    OP_INOUT_CHECK(ctx->HasInput("ExpertCount"),
-                   "Input",
-                   "ExpertCount",
-                   "prun_gate_by_capacity");
-
-    OP_INOUT_CHECK(ctx->HasOutput("NewGateIdx"),
-                   "Output",
-                   "NewGateIdx",
-                   "prun_gate_by_capacity");
-    // OP_INOUT_CHECK(ctx->HasOutput("ExpertCountOut"), "Output",
-    // "ExpertCountOut",
-    //                "prun_gate_by_capacity");
-    // auto gate_idx_dims = ctx->GetInputDim("GateIdx");
-    auto expert_count_dims = ctx->GetInputDim("ExpertCount");
-
-    int64_t n_expert = ctx->Attrs().Get<int64_t>("n_expert");
-    int64_t n_worker = ctx->Attrs().Get<int64_t>("n_worker");
-
-    int64_t expert_count_num_ele = 1;
-    for (int i = 0; i < static_cast<int>(expert_count_dims.size()); i++) {
-      expert_count_num_ele *= expert_count_dims[i];
-    }
-
-    PADDLE_ENFORCE_EQ(
-        expert_count_num_ele,
-        n_expert * n_worker,
-        phi::errors::Unavailable(
-            "The number of elements for expert_count is ( %ld ) incorrect. "
-            "Because the number of expert_count must equal the "
-            "product of n_worker ( %ld ) and n_expert ( %ld ). "
-            "Please input appropriate expert_count again!",
-            expert_count_num_ele,
-            n_worker,
-            n_expert));
-
-    auto gate_idx_in_dims = ctx->GetInputDim("GateIdx");
-    // auto expert_count_in_dims = ctx->GetInputDim("ExpertCount");
-    ctx->SetOutputDim("NewGateIdx", gate_idx_in_dims);
-    // ctx->SetOutputDim("ExpertCountOut", expert_count_in_dims);
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto gate_idx_data_type =
-        OperatorWithKernel::IndicateVarDataType(ctx, "GateIdx");
-    auto expert_count_data_type =
-        OperatorWithKernel::IndicateVarDataType(ctx, "ExpertCount");
-    PADDLE_ENFORCE_EQ(
-        gate_idx_data_type,
-        expert_count_data_type,
-        phi::errors::InvalidArgument(
-            "The dtype of the gate_idx and expert_count should be same"));
-    PADDLE_ENFORCE_EQ(gate_idx_data_type,
-                      framework::proto::VarType::INT64,
-                      phi::errors::InvalidArgument(
-                          "The dtype of the gate_idx and expert_count should "
-                          "be same as int64"));
-    return phi::KernelKey(gate_idx_data_type, ctx.GetPlace());
-  }
-};
-
-class PruneGateByCapacityOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("GateIdx",
-             "(Tensor), The gate_id sequence corresponding to the input data.");
-    AddInput("ExpertCount",
-             "(Tensor), The quantity value counted on the gate_id sequence of "
-             "the input data.");
-    AddAttr<int64_t>("n_expert", "The number of Experts on each worker")
-        .SetDefault(0);
-    AddAttr<int64_t>("n_worker", "The number of workers on the trainer")
-        .SetDefault(0);
-
-    AddOutput("NewGateIdx",
-              "(Tensor), The gate_id sequence corresponding to the new input "
-              "data after passing through prune.");
-    // AddOutput(
-    //     "ExpertCountOut",
-    //     "(Tensor), The copy quantity value counted on the gate_id sequence of
-    //     "
-    //     "the input data.");
-
-    AddComment(R"DOC(
-prune_gate_by_capacity Operator.
-
-This operator is used to prune gate by capacity(CUDA).
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(prune_gate_by_capacity,
-                             ops::PruneGateByCapacityOp,
-                             ops::PruneGateByCapacityOpMaker);
diff --git a/paddle/fluid/operators/pscore/fetch_barrier_op.cc b/paddle/fluid/operators/pscore/fetch_barrier_op.cc
index 1928464acb9df..22c75971c7cb8 100644
--- a/paddle/fluid/operators/pscore/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/pscore/fetch_barrier_op.cc
@@ -13,22 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class InferShapeContext;
 class OpDesc;
 class Scope;
 template <typename T>
 class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
+}  // namespace paddle::imperative
 
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class FetchBarrierOp : public framework::OperatorBase {
  public:
@@ -72,8 +68,7 @@ class FetchBarrierOpShapeInference : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext* ctx) const override {}
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/pscore/send_op.cc b/paddle/fluid/operators/pscore/send_op.cc
index e8908758c1875..ccecf85b4a495 100644
--- a/paddle/fluid/operators/pscore/send_op.cc
+++ b/paddle/fluid/operators/pscore/send_op.cc
@@ -16,21 +16,18 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class InferShapeContext;
 class OpDesc;
 class Scope;
 template <typename T>
 class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
+}  // namespace paddle::framework
+namespace paddle::imperative {
 class OpBase;
-}  // namespace imperative
-}  // namespace paddle
+}  // namespace paddle::imperative
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class SendOp : public framework::OperatorBase {
  public:
@@ -102,8 +99,7 @@ class SendOpShapeInference : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext* ctx) const override {}
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/read_file_op.cc b/paddle/fluid/operators/read_file_op.cc
index 0f17ca063d8e6..ed0eb9f786503 100644
--- a/paddle/fluid/operators/read_file_op.cc
+++ b/paddle/fluid/operators/read_file_op.cc
@@ -24,8 +24,7 @@
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/nullary.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class ReadFileOp : public framework::OperatorWithKernel {
  public:
@@ -50,8 +49,7 @@ This operator read a file.
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index f96a0c2679c25..6973b03f56853 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -15,9 +15,7 @@
 #include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
-namespace paddle {
-namespace operators {
-namespace reader {
+namespace paddle::operators::reader {
 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
@@ -89,9 +87,7 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
   }
 };
 
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators::reader
 
 namespace ops = paddle::operators::reader;
 REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader,
diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc
index d863d759333b6..35f5949169b1d 100644
--- a/paddle/fluid/operators/reader/py_reader.cc
+++ b/paddle/fluid/operators/reader/py_reader.cc
@@ -14,9 +14,7 @@
 
 #include "paddle/fluid/operators/reader/py_reader.h"
 
-namespace paddle {
-namespace operators {
-namespace reader {
+namespace paddle::operators::reader {
 
 PyReader::PyReader(
     const std::shared_ptr<LoDTensorBlockingQueue>& queue,
@@ -44,6 +42,4 @@ void PyReader::Shutdown() { queue_->Close(); }
 
 void PyReader::Start() { queue_->ReOpen(); }
 
-}  // namespace reader
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators::reader
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 66e6ceeb4fd1a..9b92e31e0af7c 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -14,15 +14,11 @@
 
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
-namespace paddle {
-namespace framework {
+namespace paddle::framework {
 class VarDesc;
-}  // namespace framework
-}  // namespace paddle
+}  // namespace paddle::framework
 
-namespace paddle {
-namespace operators {
-namespace reader {
+namespace paddle::operators::reader {
 
 std::vector<phi::DDim> RestoreShapes(const std::vector<int>& shape_concat,
                                      const std::vector<int>& ranks) {
@@ -161,7 +157,4 @@ void DecoratedReaderMakerBase::Make() {
   Apply();
 }
 
-}  // namespace reader
-
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators::reader
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
index 1997d1fb99fd2..73ad94c0a5c6a 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
@@ -17,7 +17,7 @@
 #include <vector>
 
 #include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc
index 2ed2e3278acad..fce12ae865173 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc
@@ -14,7 +14,7 @@
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index ba4f188274d18..464a8e547e508 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -12,19 +12,215 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
-
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
+namespace ops = paddle::operators;
 namespace paddle {
 namespace operators {
+// Operator base providing shape inference and kernel-key selection for a
+// reduction of input X into output Out over the axes in attribute "dim".
+class ReduceBaseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceBaseOp");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    PADDLE_ENFORCE_GT(dims.size(),
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The input dim dimensions of ReduceBaseOp "
+                          "should be greater than 0. But received the dim "
+                          "dimensions of Reduce = %d.",
+                          dims.size()));
+
+    for (size_t i = 0; i < dims.size(); ++i) {
+      PADDLE_ENFORCE_LT(
+          dims[i],
+          x_rank,
+          phi::errors::InvalidArgument(
+              "The reduce dim index %d should be in the "
+              "range [-dimension(X), dimension(X)) "
+              "which dimension = %d. But received dim index = %d.",
+              i,
+              x_rank,
+              dims[i]));
+      PADDLE_ENFORCE_GE(
+          dims[i],
+          -x_rank,
+          phi::errors::InvalidArgument(
+              "The reduce dim index %d should be in the "
+              "range [-dimension(X), dimension(X)) "
+              "which dimension = %d. But received dim index = %d.",
+              i,
+              x_rank,
+              dims[i]));
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];  // wrap negative axis
+    }
+    std::sort(dims.begin(), dims.end());
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim("Out",
+                          common::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = common::vectorize(x_dims);
+      if (keep_dim) {
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = 1;
+        }
+      } else {
+        const int kDelFlag = -2;  // sentinel marking axes to erase
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+      }
+      if (!keep_dim && dims_vector.size() == 0) {
+        dims_vector.push_back(1);
+      }
+      auto out_dims = common::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dims.size() > 0 && dims[0] != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+
+  // oneDNN's reduction kernel is optimized only when the reduced axes are the
+  // trailing dims of X (checked below), so for any other reduction pattern it
+  // is better to fall back to the native implementation.
+  static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) {
+    // native reduce kernels don't support bf16
+    // so oneDNN kernel is enforced in that case
+    if (ctx.Input<phi::DenseTensor>("X")->dtype() == phi::DataType::BFLOAT16)
+      return true;
+
+    if (!ctx.HasAttr("dim") || !ctx.HasAttr("reduce_all")) {
+      return false;
+    }
+
+    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
+    const bool reduce_all = ctx.Attr<bool>("reduce_all");
+    int ndims = ctx.Input<phi::DenseTensor>("X")->dims().size();
+
+    if (reduce_all) return true;
+
+    for (size_t i = 0; i < reduce_dims.size(); ++i) {
+      if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i];
+    }
+    std::sort(reduce_dims.begin(), reduce_dims.end());
+    for (size_t i = 0; i < reduce_dims.size(); ++i) {
+      if (reduce_dims[reduce_dims.size() - i - 1] !=
+          static_cast<int>(ndims - i - 1)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  phi::KernelKey GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // The kernel data type follows the dtype of input X.
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+    // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL
+    if (ctx.Input<phi::DenseTensor>("X")->dims().size() > 5 ||
+        !HasOptimizedOneDNNKernel(ctx)) {
+      this->SetDnnFallback(true);
+    }
+    // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL
+
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE_EQ(
+          ctx.GetPlace().GetType() == phi::AllocationType::GPU ||
+              ctx.GetPlace().GetType() == phi::AllocationType::XPU ||
+              ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM,
+          true,
+          phi::errors::InvalidArgument(
+              "float16 can only be used on GPU, XPU or CUSTOM place"));
+    }
+    return phi::KernelKey(input_data_type, ctx.GetPlace());
+  }
+};
+
+// Gradient operator for reductions: the gradient of X takes X's shape and LoD.
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceGradOp");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input",
+                   "Out@GRAD",
+                   "ReduceGradOp");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    // TODO(dev): We should delete Infershape and migrate it into
+    // UnchangeInferMeta. When 'dim' is a Variable, it does not exist in
+    // Attrs but in Inputs.
+    if (ctx->HasAttr("dim")) {
+      const auto& dims = ctx->Attrs().Get<std::vector<int>>("dim");
+      for (size_t i = 0; i < dims.size(); ++i) {
+        PADDLE_ENFORCE_LT(
+            dims[i],
+            x_rank,
+            phi::errors::InvalidArgument(
+                "The reduce dim index %d should be in the "
+                "range [-dimension(X), dimension(X)), "
+                "which dimension = %d. But received dim index = %d.",
+                i,
+                x_rank,
+                dims[i]));
+      }
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
+  }
+
+ protected:
+  phi::KernelKey GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    int out_dtype = ctx.Attr<int>("out_dtype");
+    auto input_data_type =
+        (out_dtype >= 0)
+            ? static_cast<framework::proto::VarType::Type>(out_dtype)
+            : OperatorWithKernel::IndicateVarDataType(
+                  ctx, framework::GradVarName("Out"));
+
+    // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL
+    // max 5D tensor is supported
+    if (ctx.Input<phi::DenseTensor>("X")->dims().size() > 5) {
+      this->SetDnnFallback(true);
+    }
+    // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL
+
+    return phi::KernelKey(input_data_type, ctx.GetPlace());
+  }
+};
 
 // NOTE(dengkaipeng): Input(Out) is unnecessary in reduce_mean_grad
 // calcualtion, but will incur a reduce_mean_grad op after
@@ -65,6 +261,7 @@ class ReduceMeanDoubleGradDescMaker : public framework::GradOpDescMakerBase {
     return ops;
   }
 };
+
 class ReduceMeanDoubleGradOpBaseMaker : public imperative::GradOpBaseMakerBase {
  public:
   using imperative::GradOpBaseMakerBase::GradOpBaseMakerBase;
@@ -89,6 +286,56 @@ class ReduceMeanDoubleGradOpBaseMaker : public imperative::GradOpBaseMakerBase {
   }
 };
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceMeanGradNoNeedBufferVarInferer, "X");
+
+class ReduceBaseOpMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() final {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<std::vector<int>>(
+        "dim",
+        "(list<int>, default {0}) The dimensions to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault({0})
+        .SupportTensor();
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    AddAttr<int>("in_dtype",
+                 "(int, default -1) "
+                 "The dtype of input, default value is -1, the user could not "
+                 "set this value.")
+        .SetDefault(-1);
+    AddAttr<int>(
+        "out_dtype",
+        "(int, default -1) "
+        "The dtype of output, default value is -1, the dtype is same as input")
+        .SetDefault(-1);
+    AddComment(string::Sprintf(R"DOC(
+%s Operator.
+
+This operator computes the %s of input tensor along the given dimension.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC",
+                               GetOpType(),
+                               GetName()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;    // fills "computes the %s of"
+  virtual std::string GetOpType() const = 0;  // fills "%s Operator."
+};
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h b/paddle/fluid/operators/reduce_ops/reduce_mean_op.h
deleted file mode 100644
index eb82be83ba517..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-struct MeanFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->mean(dim);
-  }
-};
-
-struct MeanGradFunctor {
-  template <typename DeviceContext,
-            typename X,
-            typename Y,
-            typename DX,
-            typename DY,
-            typename Dim>
-  void operator()(const DeviceContext& place,
-                  X* x,
-                  Y* y,
-                  DX* dx,
-                  DY* dy,
-                  const Dim& dim,
-                  int size) {
-    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
-  }
-};
-
-// TODO(zengjinle): Should refine the numeric stability of FP16 reduce_mean
-// and reduce_mean_grad later.
-struct FP16MeanGradFunctor {
-  template <typename DeviceContext,
-            typename X,
-            typename Y,
-            typename DX,
-            typename DY,
-            typename Dim>
-  void operator()(const DeviceContext& place,
-                  X* x,
-                  Y* y,
-                  DX* dx,
-                  DY* dy,
-                  const Dim& dim,
-                  int size) {
-    dx->device(place) = (dy->template cast<float>().broadcast(dim) /
-                         dx->template cast<float>().constant(size))
-                            .template cast<phi::dtype::float16>();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
deleted file mode 100644
index 44a82397dcc07..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ /dev/null
@@ -1,895 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-// only can include the headers in paddle/phi/api dirs
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
-
-#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
-#include "paddle/phi/kernels/gpu/reduce.h"
-#include "paddle/phi/kernels/gpu/reduce_grad.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define HANDLE_DIM(NDIM, RDIM)                                   \
-  if (ndim == NDIM && rdim == RDIM) {                            \
-    paddle::operators::                                          \
-        ReduceFunctor<DeviceContext, OutT, NDIM, RDIM, Functor>( \
-            context.template device_context<DeviceContext>(),    \
-            *input,                                              \
-            output,                                              \
-            dims,                                                \
-            keep_dim);                                           \
-  }
-
-using DDim = phi::DDim;
-
-inline void GetShuffledDim(const DDim& src_dims,
-                           DDim* dst_dims,
-                           const std::vector<int>& reduced_dims,
-                           std::vector<int>* perm_axis) {
-  // check if it's a reduced dim
-  std::vector<bool> src_dims_check(src_dims.size(), false);
-  size_t src_size = src_dims.size();
-  size_t reduce_size = reduced_dims.size();
-  for (size_t i = 0; i < reduce_size; ++i) {
-    dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]];
-    (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i];
-    src_dims_check[reduced_dims[i]] = true;
-  }
-
-  size_t offset = 0;
-  for (size_t i = 0; i < src_dims_check.size(); ++i) {
-    bool is_reduced = src_dims_check[i];
-    if (!is_reduced) {
-      (*perm_axis)[offset] = i;
-      dst_dims->at(offset++) = src_dims[i];
-    }
-  }
-}
-
-static inline std::vector<int> GetReduceDim(const std::vector<int>& dims,
-                                            int dim_size,
-                                            bool reduce_all) {
-  std::vector<int> reduce_dims;
-  if (reduce_all) {
-    reduce_dims.resize(dim_size);
-    int reduce_size = reduce_dims.size();
-    for (int i = 0; i < reduce_size; ++i) {
-      reduce_dims[i] = i;
-    }
-  } else {
-    for (auto e : dims) {
-      PADDLE_ENFORCE_LT(e,
-                        dim_size,
-                        phi::errors::InvalidArgument(
-                            "ReduceBaseOp: invalid axis, when x_dims is %d, "
-                            "axis[i] should less than x_dims, but got %d.",
-                            dim_size,
-                            e));
-      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
-    }
-  }
-  return reduce_dims;
-}
-template <typename DeviceContext, typename OutT>
-void GetShuffledInput(const framework::ExecutionContext& context,
-                      const phi::DenseTensor* input,
-                      phi::DenseTensor* shuffled_input,
-                      const std::vector<int>& dims) {
-  DDim shuffled_dims(input->dims());
-  std::vector<int> perm_axis(input->dims().size());
-  GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis);
-
-  shuffled_input->Resize(shuffled_dims);
-  shuffled_input->mutable_data<OutT>(context.GetPlace());
-
-  phi::funcs::TransposeNormal<DeviceContext, OutT> trans;
-  trans(context.template device_context<DeviceContext>(),
-        *input,
-        shuffled_input,
-        perm_axis);
-}
-
-inline void GetOriginDimFromShuffled(const DDim& src_dim,
-                                     const std::vector<int>& dims,
-                                     std::vector<int>* origin_dim) {
-  DDim shuffled_dims(src_dim);
-  size_t n = src_dim.size();
-  std::vector<int> perm_axis(n);
-  GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis);
-  for (size_t i = 0; i < n; ++i) {
-    (*origin_dim)[perm_axis[i]] = i;
-  }
-}
-
-template <typename DeviceContext, typename OutT, typename Functor>
-void HandleLargeDim(const framework::ExecutionContext& context,
-                    const phi::DenseTensor* input,
-                    phi::DenseTensor* output,
-                    const std::vector<int>& dims,
-                    bool keep_dim) {
-  //  shuffle the reduced dim to the end
-  phi::DenseTensor shuffled_input;
-  GetShuffledInput<DeviceContext, OutT>(context, input, &shuffled_input, dims);
-
-  // transpose to 2D tensor whose shape is {unreduced, reduced}.
-  const int64_t unreduced = output->numel();
-  const int64_t input_numel = shuffled_input.numel();
-  // assume: 0 / 0 == 0, which allow process 0 dim tensor
-  const int64_t reduced = (unreduced != 0) ? (input_numel / unreduced) : 0;
-
-  PADDLE_ENFORCE_EQ(
-      unreduced * reduced,
-      input_numel,
-      phi::errors::InvalidArgument(
-          "Reducing failed in HandleLargeDim, when try to transpose (%d) "
-          "operands into 2D tensor with shape (%d, %d).",
-          input_numel,
-          unreduced,
-          reduced));
-
-  shuffled_input.Resize({unreduced, reduced});
-
-  DDim output_dim = output->dims();
-  output->Resize({unreduced});
-  paddle::operators::ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
-      context.template device_context<DeviceContext>(),
-      shuffled_input,
-      output,
-      {1},
-      keep_dim);
-  output->Resize(output_dim);
-}
-
-template <typename DeviceContext, typename T, typename Functor>
-void HandleLargeDimGrad(const framework::ExecutionContext& context,
-                        const phi::DenseTensor* x,
-                        const phi::DenseTensor* out,
-                        const phi::DenseTensor* dout,
-                        phi::DenseTensor* dx,
-                        Functor functor,
-                        const std::vector<int>& dims) {
-  const int64_t unreduced = out->numel();
-  const int64_t x_numel = x->numel();
-  // assume: 0 / 0 == 0, which allow process 0 dim tensor
-  const int64_t reduced = (unreduced != 0) ? (x_numel / unreduced) : 0;
-
-  PADDLE_ENFORCE_EQ(
-      unreduced * reduced,
-      x_numel,
-      phi::errors::InvalidArgument(
-          "Reducing failed in HandleLargeDimGrad, when try to transpose (%d) "
-          "operands into 2D tensor with shape (%d, %d).",
-          x_numel,
-          unreduced,
-          reduced));
-
-  DDim out_dim(out->dims());
-  DDim x_dim(x->dims());
-  // transpose and reshape X
-  phi::DenseTensor shuffled_x;
-  GetShuffledInput<DeviceContext, T>(context, x, &shuffled_x, dims);
-  DDim shuffled_dim = shuffled_x.dims();
-  shuffled_x.Resize({unreduced, reduced});
-  // reshape dX {unreduced, reduced}
-  dx->Resize({unreduced, reduced});
-  ReduceGradFunctor<DeviceContext, T, 2, Functor>(
-      context.template device_context<DeviceContext>(),
-      shuffled_x,
-      *out,
-      *dout,
-      dx,
-      functor,
-      {1});
-  // transpose dX
-  std::vector<int> origin_axis(x_dim.size());
-  GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
-  phi::DenseTensor dx_tmp;
-  framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp);
-  dx_tmp.Resize(shuffled_dim);
-  dx->Resize(x_dim);
-  phi::funcs::TransposeNormal<DeviceContext, T> trans;
-  trans(context.template device_context<DeviceContext>(),
-        dx_tmp,
-        dx,
-        origin_axis);
-}
-
-template <typename DeviceContext, typename T, typename Functor>
-struct ReduceKernelFunctor {
-  const phi::DenseTensor* input;
-  phi::DenseTensor* output;
-  std::vector<int> dims;
-  bool keep_dim;
-  bool reduce_all;
-  const framework::ExecutionContext& context;
-  ReduceKernelFunctor(const phi::DenseTensor* input,
-                      phi::DenseTensor* output,
-                      const std::vector<int>& dims,
-                      bool keep_dim,
-                      bool reduce_all,
-                      const framework::ExecutionContext& context)
-      : input(input),
-        output(output),
-        dims(dims),
-        keep_dim(keep_dim),
-        reduce_all(reduce_all),
-        context(context) {}
-
-  template <typename OutT>
-  void apply() const {
-    output->mutable_data<OutT>(context.GetPlace());
-    if (reduce_all) {
-      // Flatten and reduce 1-D tensor
-      auto x = EigenVector<OutT>::Flatten(*input);
-      auto out = EigenScalar<OutT>::From(*output);
-      auto& place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      auto reduce_dim = Eigen::array<int, 1>({{0}});
-      Functor functor;
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      int ndim = input->dims().size();
-      int rdim = dims.size();
-      if (ndim > 6) {
-        HandleLargeDim<DeviceContext, OutT, Functor>(
-            context, input, output, dims, keep_dim);
-      } else {
-        HANDLE_DIM(6, 5);
-        HANDLE_DIM(6, 4);
-        HANDLE_DIM(6, 3);
-        HANDLE_DIM(6, 2);
-        HANDLE_DIM(6, 1);
-        HANDLE_DIM(5, 4);
-        HANDLE_DIM(5, 3);
-        HANDLE_DIM(5, 2);
-        HANDLE_DIM(5, 1);
-        HANDLE_DIM(4, 3);
-        HANDLE_DIM(4, 2);
-        HANDLE_DIM(4, 1);
-        HANDLE_DIM(3, 2);
-        HANDLE_DIM(3, 1);
-        HANDLE_DIM(2, 1);
-        HANDLE_DIM(1, 1);
-      }
-    }
-  }
-};
-template <typename DeviceContext, typename T, typename Functor>
-class ReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* output = context.Output<phi::DenseTensor>("Out");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-    int out_dtype = context.Attr<int>("out_dtype");
-    framework::proto::VarType::Type cast_out_dtype;
-    auto* input = context.Input<phi::DenseTensor>("X");
-
-    if (out_dtype < 0) {
-      cast_out_dtype = static_cast<framework::proto::VarType::Type>(
-          framework::TransToProtoVarType(input->dtype()));
-    } else {
-      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
-    }
-
-    auto& dev_ctx = context.device_context<DeviceContext>();
-    output->mutable_data(
-        dev_ctx.GetPlace(),
-        static_cast<framework::proto::VarType::Type>(cast_out_dtype));
-
-    std::vector<int64_t> tmp_dims(dims.begin(), dims.end());
-
-    // call new kernel
-    phi::Reduce<typename framework::ConvertToPhiContext<DeviceContext>::TYPE,
-                T,
-                Functor>(
-        static_cast<const typename framework::ConvertToPhiContext<
-            DeviceContext>::TYPE&>(dev_ctx),
-        *input,
-        reduce_all,
-        tmp_dims,
-        keep_dim,
-        framework::TransToPhiDataType(cast_out_dtype),
-        output);
-  }
-};
-
-template <typename DeviceContext, typename T, typename Functor>
-void LaunchReduceGradKernel(const framework::ExecutionContext& context,
-                            const phi::DenseTensor* input0,
-                            const phi::DenseTensor* input1,
-                            const phi::DenseTensor* input2,
-                            phi::DenseTensor* output,
-                            Functor functor,
-                            const std::vector<int>& dims,
-                            bool reduce_all = false) {
-  if (reduce_all) {
-    auto x = EigenVector<T>::Flatten(*input0);
-    auto x_reduce = EigenVector<T>::Flatten(*input1);
-    auto x_reduce_grad = EigenVector<T>::Flatten(*input2);
-    auto x_grad = EigenVector<T>::Flatten(*output);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto broadcast_dim =
-        Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
-    functor(place,
-            &x,
-            &x_reduce,
-            &x_grad,
-            &x_reduce_grad,
-            broadcast_dim,
-            broadcast_dim[0]);
-  } else {
-    int rank = input0->dims().size();
-    switch (rank) {
-      case 1:
-        ReduceGradFunctor<DeviceContext, T, 1, Functor>(
-            context.template device_context<DeviceContext>(),
-            *input0,
-            *input1,
-            *input2,
-            output,
-            functor,
-            dims);
-        break;
-      case 2:
-        ReduceGradFunctor<DeviceContext, T, 2, Functor>(
-            context.template device_context<DeviceContext>(),
-            *input0,
-            *input1,
-            *input2,
-            output,
-            functor,
-            dims);
-        break;
-      case 3:
-        ReduceGradFunctor<DeviceContext, T, 3, Functor>(
-            context.template device_context<DeviceContext>(),
-            *input0,
-            *input1,
-            *input2,
-            output,
-            functor,
-            dims);
-        break;
-      case 4:
-        ReduceGradFunctor<DeviceContext, T, 4, Functor>(
-            context.template device_context<DeviceContext>(),
-            *input0,
-            *input1,
-            *input2,
-            output,
-            functor,
-            dims);
-        break;
-      case 5:
-        ReduceGradFunctor<DeviceContext, T, 5, Functor>(
-            context.template device_context<DeviceContext>(),
-            *input0,
-            *input1,
-            *input2,
-            output,
-            functor,
-            dims);
-        break;
-      case 6:
-        ReduceGradFunctor<DeviceContext, T, 6, Functor>(
-            context.template device_context<DeviceContext>(),
-            *input0,
-            *input1,
-            *input2,
-            output,
-            functor,
-            dims);
-        break;
-      default:
-        HandleLargeDimGrad<DeviceContext, T, Functor>(
-            context, input0, input1, input2, output, functor, dims);
-        break;
-    }
-  }
-}
-
-template <typename DeviceContext,
-          typename T,
-          typename Functor,
-          bool kNoNeedBufferX = false,
-          bool kNoNeedBufferY = false>
-class ReduceGradKernel : public framework::OpKernel<T> {
- public:
-  void ComputeFromInput(const phi::DenseTensor* input2,
-                        const framework::ExecutionContext& context) const {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    auto* input0 = context.Input<phi::DenseTensor>("X");
-    auto* input1 = context.Input<phi::DenseTensor>("Out");
-
-    auto* output =
-        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    output->mutable_data<T>(context.GetPlace());
-
-    // The dims has full dim, set the reduce_all is True
-    const auto& input_dim_size =
-        context.Input<phi::DenseTensor>("X")->dims().size();
-    std::set<int> dims_set(dims.begin(), dims.end());
-    bool full_dim = true;
-    for (auto i = 0; i < input_dim_size; i++) {
-      if (dims_set.find(i) == dims_set.end()) {
-        full_dim = false;
-        break;
-      }
-    }
-    reduce_all = (reduce_all || full_dim);
-    // NOTE: EigenTensor::From() uses tensor->data()
-    // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
-    // kNoNeedBufferY should set true
-    // and use fake var that has same dims.
-    if (kNoNeedBufferX) {
-      input0 = output;
-    }
-    if (kNoNeedBufferY) {
-      input1 = input2;
-    }
-
-    const std::vector<int> const_dims = dims;
-
-    // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
-    // not be set as Input in grad Maker, use Out_grad to replace here
-    if (!input1) input1 = input2;
-    Functor functor;
-    LaunchReduceGradKernel<DeviceContext, T, Functor>(context,
-                                                      input0,
-                                                      input1,
-                                                      input2,
-                                                      output,
-                                                      functor,
-                                                      const_dims,
-                                                      reduce_all);
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    int in_dtype = context.Attr<int>("in_dtype");
-    if (in_dtype >= 0) {
-      phi::DenseTensor tmp_tensor;
-      auto* pre_input =
-          context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      auto in_kernel_type =
-          phi::KernelKey(framework::TransToProtoVarType(pre_input->dtype()),
-                         context.GetPlace());
-      auto out_kernel_type =
-          phi::KernelKey(static_cast<framework::proto::VarType::Type>(in_dtype),
-                         context.GetPlace());
-      framework::TransDataType(
-          in_kernel_type, out_kernel_type, *pre_input, &tmp_tensor);
-      ComputeFromInput(&tmp_tensor, context);
-
-    } else {
-      auto* input2 =
-          context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-      ComputeFromInput(input2, context);
-    }
-  }
-};
-
-class ReduceBaseOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceBaseOp");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
-    PADDLE_ENFORCE_GT(dims.size(),
-                      0,
-                      phi::errors::InvalidArgument(
-                          "The input dim dimensions of ReduceBaseOp "
-                          "should be greater than 0. But received the dim "
-                          "dimensions of Reduce = %d.",
-                          dims.size()));
-
-    for (size_t i = 0; i < dims.size(); ++i) {
-      PADDLE_ENFORCE_LT(
-          dims[i],
-          x_rank,
-          phi::errors::InvalidArgument(
-              "The reduce dim index %d should be in the "
-              "range [-dimension(X), dimension(X)] "
-              "which dimension = %d. But received dim index = %d.",
-              i,
-              x_rank,
-              dims[i]));
-      PADDLE_ENFORCE_GE(
-          dims[i],
-          -x_rank,
-          phi::errors::InvalidArgument(
-              "The reduce dim index %d should be in the "
-              "range [-dimension(X), dimension(X)] "
-              "which dimension = %d. But received dim index = %d.",
-              i,
-              x_rank,
-              dims[i]));
-      if (dims[i] < 0) dims[i] = x_rank + dims[i];
-    }
-    sort(dims.begin(), dims.end());
-    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    if (reduce_all) {
-      if (keep_dim)
-        ctx->SetOutputDim("Out",
-                          common::make_ddim(std::vector<int64_t>(x_rank, 1)));
-      else
-        ctx->SetOutputDim("Out", {1});
-    } else {
-      auto dims_vector = common::vectorize(x_dims);
-      if (keep_dim) {
-        for (size_t i = 0; i < dims.size(); ++i) {
-          dims_vector[dims[i]] = 1;
-        }
-      } else {
-        const int kDelFlag = -2;
-        for (size_t i = 0; i < dims.size(); ++i) {
-          dims_vector[dims[i]] = kDelFlag;
-        }
-        dims_vector.erase(
-            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-            dims_vector.end());
-      }
-      if (!keep_dim && dims_vector.size() == 0) {
-        dims_vector.push_back(1);
-      }
-      auto out_dims = common::make_ddim(dims_vector);
-      ctx->SetOutputDim("Out", out_dims);
-      if (dims.size() > 0 && dims[0] != 0) {
-        // Only pass LoD when not reducing on the first dim.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
-    }
-  }
-
-  // oneDNN's reduction kernel is optimized only for reducing throughout the
-  // most outer dims, so in case of another type of reduction, it would be
-  // better to fallback to native implementation
-  static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) {
-    // native reduce kernels don't support bf16
-    // so oneDNN kernel is enforced in that case
-    if (ctx.Input<phi::DenseTensor>("X")->dtype() == phi::DataType::BFLOAT16)
-      return true;
-
-    if (!ctx.HasAttr("dim") || !ctx.HasAttr("reduce_all")) {
-      return false;
-    }
-
-    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
-    const bool reduce_all = ctx.Attr<bool>("reduce_all");
-    int ndims = ctx.Input<phi::DenseTensor>("X")->dims().size();
-
-    if (reduce_all) {
-      return true;
-    }
-
-    for (size_t i = 0; i < reduce_dims.size(); ++i) {
-      if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i];
-    }
-    sort(reduce_dims.begin(), reduce_dims.end());
-    for (size_t i = 0; i < reduce_dims.size(); ++i) {
-      if (reduce_dims[reduce_dims.size() - i - 1] !=
-          static_cast<int>(ndims - i - 1)) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    // choose cudnn kernel if the runtime supported.
-    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-    // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL
-    if (ctx.Input<phi::DenseTensor>("X")->dims().size() > 5 ||
-        !HasOptimizedOneDNNKernel(ctx)) {
-      this->SetDnnFallback(true);
-    }
-    // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL
-
-    if (input_data_type == framework::proto::VarType::FP16) {
-      PADDLE_ENFORCE_EQ(
-          ctx.GetPlace().GetType() == phi::AllocationType::GPU ||
-              ctx.GetPlace().GetType() == phi::AllocationType::XPU ||
-              ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM,
-          true,
-          phi::errors::InvalidArgument(
-              "float16 can only be used on GPU or XPU place"));
-    }
-    return phi::KernelKey(input_data_type, ctx.GetPlace());
-  }
-};
-
-class ReduceOpUseInputPlace : public ReduceBaseOp {
- public:
-  using ReduceBaseOp::ReduceBaseOp;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    phi::KernelKey kt = OperatorWithKernel::GetExpectedKernelType(ctx);
-    kt.set_backend(
-        phi::TransToPhiBackend(ctx.Input<phi::DenseTensor>("X")->place()));
-    return kt;
-  }
-};
-
-class ReduceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp");
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input",
-                   "Out@GRAD",
-                   "ReduceBaseOp");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    // TODO(dev): We should delete Infershape and migrate it into
-    // UnchangeInferMeta.In case of 'dim' is Variable, it will
-    // not exist in Attrs but in Inputs.
-    if (ctx->HasAttr("dim")) {
-      auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
-      for (size_t i = 0; i < dims.size(); ++i) {
-        PADDLE_ENFORCE_LT(
-            dims[i],
-            x_rank,
-            phi::errors::InvalidArgument(
-                "The reduce dim index %d should be in the "
-                "range [-dimension(X), dimension(X)], "
-                "which dimension = %d. But received dim index = %d.",
-                i,
-                x_rank,
-                dims[i]));
-        if (dims[i] < 0) dims[i] = x_rank + dims[i];
-      }
-    }
-
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    int out_dtype = ctx.Attr<int>("out_dtype");
-    auto input_data_type =
-        (out_dtype >= 0)
-            ? static_cast<framework::proto::VarType::Type>(out_dtype)
-            : OperatorWithKernel::IndicateVarDataType(
-                  ctx, framework::GradVarName("Out"));
-
-    // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL
-    // max 5D tensor is supported
-    if (ctx.Input<phi::DenseTensor>("X")->dims().size() > 5) {
-      dnn_fallback_ = true;
-    }
-    // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL
-
-    return phi::KernelKey(input_data_type, ctx.GetPlace());
-  }
-};
-
-class ReduceBaseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() final {
-    AddInput("X",
-             "(Tensor) The input tensor. Tensors with rank at most 6 are "
-             "supported.");
-    AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<std::vector<int>>(
-        "dim",
-        "(list<int>, default {0}) The dimensions to reduce. "
-        "Must be in the range [-rank(input), rank(input)). "
-        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
-        "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault({0})
-        .SupportTensor();
-    AddAttr<bool>("keep_dim",
-                  "(bool, default false) "
-                  "If true, retain the reduced dimension with length 1.")
-        .SetDefault(false);
-    AddAttr<bool>("reduce_all",
-                  "(bool, default false) "
-                  "If true, output a scalar reduced along all dimensions.")
-        .SetDefault(false);
-    AddAttr<int>("in_dtype",
-                 "(int, default -1)"
-                 "The dtype of input, default value is -1, the user could not "
-                 "set this value.")
-        .SetDefault(-1);
-    AddAttr<int>(
-        "out_dtype",
-        "(int, default -1)"
-        "The dtype of output, default value is -1, the dtype is same as intput")
-        .SetDefault(-1);
-    AddComment(string::Sprintf(R"DOC(
-%s Operator.
-
-This operator computes the %s of input tensor along the given dimension.
-The result tensor has 1 fewer dimension than the input unless keep_dim is true.
-If reduce_all is true, just reduce along all dimensions and output a scalar.
-
-)DOC",
-                               GetOpType(),
-                               GetName()));
-  }
-
- protected:
-  virtual std::string GetName() const = 0;
-  virtual std::string GetOpType() const = 0;
-};
-
-#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
-template <typename T,
-          template <typename>
-          class ReduceBaseOp,
-          template <typename, typename>
-          class TransformOp>
-class ReduceCudaKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    const phi::DenseTensor* input = context.Input<phi::DenseTensor>("X");
-    phi::DenseTensor* output = context.Output<phi::DenseTensor>("Out");
-    auto out_dtype = context.Attr<int>("out_dtype");
-    auto pt_out_dtype = paddle::framework::TransToPhiDataType(
-        static_cast<framework::proto::VarType::Type>(out_dtype));
-    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
-#ifdef PADDLE_WITH_XPU_KP
-    auto& dev_ctx = context.template device_context<phi::XPUContext>();
-#else
-    auto& dev_ctx = context.cuda_device_context();
-#endif
-    if (out_dtype >= 0) {
-      output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype);
-    } else {
-      output->mutable_data(dev_ctx.GetPlace(), input->dtype());
-    }
-
-    std::vector<int64_t> dims_int64{dims.begin(), dims.end()};
-
-    phi::Reduce<T, ReduceBaseOp, TransformOp>(
-        dev_ctx, *input, reduce_all, dims_int64, false, pt_out_dtype, output);
-  }
-};
-
-#ifndef PADDLE_WITH_XPU_KP
-template <typename T, template <typename, typename> class TransformOp>
-class ReduceCudaGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
-    auto* in_x = context.Input<phi::DenseTensor>("X");
-
-    auto* d_out =
-        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    auto out_dtype = context.Attr<int>("in_dtype");
-    auto pt_out_dtype = framework::TransToPhiDataType(
-        static_cast<framework::proto::VarType::Type>(out_dtype));
-    // get reduce_dim and reduce_num for reduce_mean_grad
-    int dim_size = in_x->dims().size();
-    std::vector<int> reduce_dims = GetReduceDim(dims, dim_size, reduce_all);
-    auto update_dims = common::vectorize(d_x->dims());
-    int reduce_num = 1;
-    for (auto i : reduce_dims) {
-      reduce_num *= (in_x->dims())[i];
-      update_dims[i] = 1;
-    }
-    // make new tensor
-    phi::DenseTensor new_d_out(d_out->type());
-    new_d_out.ShareDataWith(*d_out);
-    new_d_out.Resize(common::make_ddim(update_dims));
-    auto& dev_ctx = context.cuda_device_context();
-    if (out_dtype > 0) {
-      d_x->mutable_data(dev_ctx.GetPlace(), pt_out_dtype);
-    } else {
-      d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype());
-    }
-    auto pt_d_out = std::make_unique<phi::DenseTensor>(new_d_out);
-    auto pt_d_x = std::make_unique<phi::DenseTensor>(*d_x);
-    if (out_dtype <= 0) {
-      pt_out_dtype = d_out->dtype();
-    }
-
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-    phi::ReduceGrad<TransformOp<T, MPType>>(dev_ctx,
-                                            pt_d_out.get(),
-                                            pt_d_x.get(),
-                                            pt_out_dtype,
-                                            TransformOp<T, MPType>(reduce_num));
-  }
-};
-
-template <typename T>
-struct EqualFunctor {
-  inline T initial() { return static_cast<T>(0.0f); }
-
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    return static_cast<T>(a == b);
-  }
-};
-
-template <typename T, typename Enable = void>
-struct DivideFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
-};
-#endif
-#endif
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_OP(op_name)                                           \
-  class __##op_name##Maker__ : public ops::ReduceBaseOpMaker {                \
-   protected:                                                                 \
-    virtual std::string GetName() const { return #op_name; }                  \
-    virtual std::string GetOpType() const { return "Reduce " #op_name; }      \
-  };                                                                          \
-  REGISTER_OPERATOR(                                                          \
-      op_name,                                                                \
-      ops::ReduceBaseOp,                                                      \
-      __##op_name##Maker__,                                                   \
-      paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>, \
-      paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase,       \
-                                            true>);                           \
-  REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
-
-#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name, ...)                    \
-  class __##op_name##Maker__ : public ops::ReduceBaseOpMaker {           \
-   protected:                                                            \
-    virtual std::string GetName() const { return #op_name; }             \
-    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
-  };                                                                     \
-  REGISTER_OPERATOR(                                                     \
-      op_name,                                                           \
-      ops::ReduceBaseOp##__VA_ARGS__,                                    \
-      __##op_name##Maker__,                                              \
-      paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,    \
-      paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h
deleted file mode 100644
index b8043dcd94ba0..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-
-using DDim = phi::DDim;
-template <typename T,
-          size_t D,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = phi::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = phi::EigenScalar<T, MajorType, IndexType>;
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = phi::EigenVector<T, MajorType, IndexType>;
-
-template <typename DeviceContext,
-          typename T,
-          size_t D,
-          size_t R_D,
-          typename Functor>
-void ReduceFunctor(const DeviceContext& context,
-                   const phi::DenseTensor& input,
-                   phi::DenseTensor* output,
-                   const std::vector<int>& dims,
-                   bool keep_dim) {
-  auto x = EigenTensor<T, D>::From(input);
-  auto x_rank = static_cast<int>(x.dimensions().size());
-  auto reduce_dim = Eigen::array<int, R_D>();
-  std::vector<int> dims_ref = dims;
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
-    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
-    reduce_dim[i] = dims_ref[i];
-  }
-  // construct the squeezed output tensor
-  DDim out_dims = output->dims();
-  if (keep_dim && x_rank > 1) {
-    const int kDelFlag = -2;
-    auto dims_vector = common::vectorize(out_dims);
-    for (size_t i = 0; i < dims_ref.size(); ++i) {
-      dims_vector[dims_ref[i]] = kDelFlag;
-    }
-    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-                      dims_vector.end());
-    out_dims = common::make_ddim(dims_vector);
-  }
-  auto& place = *context.eigen_device();
-  Functor functor;
-
-  if (D == 1) {
-    auto out = EigenScalar<T>::From(*output);
-    functor(place, &x, &out, reduce_dim);
-  } else {
-    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
-    functor(place, &x, &out, reduce_dim);
-  }
-}
-
-template <typename DeviceContext, typename T, size_t D, typename Functor>
-void ReduceGradFunctor(const DeviceContext& context,
-                       const phi::DenseTensor& input0,
-                       const phi::DenseTensor& input1,
-                       const phi::DenseTensor& input2,
-                       phi::DenseTensor* output,
-                       Functor functor,
-                       const std::vector<int>& dims) {
-  auto x = EigenTensor<T, D>::From(input0);
-  auto x_grad = EigenTensor<T, D>::From(*output);
-  auto x_rank = static_cast<int>(x.dimensions().size());
-  auto x_dims = input0.dims();
-  auto reduced_dims_v = common::vectorize(x_dims);
-  std::vector<int> dims_ref = dims;
-  Eigen::array<int, D> broadcast_dim;
-  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-
-  int broad_cats_times = 1;
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
-    if (dims_ref[i] < 0) {
-      dims_ref[i] = x_rank + dims_ref[i];
-    }
-    reduced_dims_v[dims_ref[i]] = 1;
-    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
-    broad_cats_times *= x_dims[dims_ref[i]];
-  }
-  auto reduced_dims = common::make_ddim(reduced_dims_v);
-  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
-  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
-
-  auto& place = *context.eigen_device();
-
-  functor(place,
-          &x,
-          &x_reduce,
-          &x_grad,
-          &x_reduce_grad,
-          broadcast_dim,
-          broad_cats_times);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc
index e276ef2082fb6..2ebdd3efa5346 100644
--- a/paddle/fluid/operators/repeat_interleave_op.cc
+++ b/paddle/fluid/operators/repeat_interleave_op.cc
@@ -18,8 +18,7 @@
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class RepeatInterleaveOp : public framework::OperatorWithKernel {
  public:
@@ -160,8 +159,7 @@ class RepeatInterleaveGradMaker : public framework::SingleGradOpMaker<T> {
 
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(RepeatInterleaveGradNoNeedBufferVarsInferer,
                                     "X");
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc
deleted file mode 100644
index 3111ad4e5015d..0000000000000
--- a/paddle/fluid/operators/rrelu_op.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class RReluOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class RReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "The input of RReLU op.");
-    AddOutput("Out", "The output of RReLU op.");
-    AddOutput("Noise", "The random sampled RReLU noise.")
-        .AsIntermediate()
-        .AsExtra();
-    AddAttr<bool>("is_test",
-                  "(bool, default false) Set to true for inference only, false "
-                  "for training. Some layers may run faster when this is true.")
-        .SetDefault(false);
-    float default_lower = 1. / 8.;
-    AddAttr<float>("lower", "Lower bound of the uniform distribution.")
-        .SetDefault(default_lower)
-        .AddCustomChecker([](const float& lower) {
-          PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f,
-                            true,
-                            phi::errors::InvalidArgument(
-                                "'RRelu_lower' must be between 0.0 and 1.0."));
-        });
-    float defalut_upper = 1. / 3.;
-    AddAttr<float>("upper", "Upper bound of the uniform distribution.")
-        .SetDefault(defalut_upper)
-        .AddCustomChecker([](const float& upper) {
-          PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f,
-                            true,
-                            phi::errors::InvalidArgument(
-                                "'RRelu_upper' must be between 0.0 and 1.0."));
-        });
-    AddComment(R"DOC(
-RReLU Operator.
-
-Applies the randomized leaky rectified liner unit function, element-wise,
-as described in the paper:
-
-`Empirical Evaluation of Rectified Activations in Convolutional Network`_.
-
-The function is defined as:
-
-.. math::
-    \text{RReLU}(x) =
-    \begin{cases}
-        x & \text{if } x \geq 0 \\
-        ax & \text{ otherwise }
-    \end{cases}
-
-where :math:`a` is randomly sampled from uniform distribution
-:math:`\mathcal{U}(\text{lower}, \text{upper})`.
-
- See: https://arxiv.org/pdf/1505.00853.pdf
-
-)DOC");
-  }
-};
-
-class RReluGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-template <typename T>
-class RReluGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("rrelu_grad");
-    op->SetInput("X", this->Input("X"));
-    op->SetInput("Noise", this->Output("Noise"));
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(rrelu,
-                            RReluInferShapeFunctor,
-                            PD_INFER_META(phi::RReluInferMeta));
-
-REGISTER_OPERATOR(rrelu,
-                  ops::RReluOp,
-                  ops::RReluOpMaker,
-                  ops::RReluGradOpMaker<paddle::framework::OpDesc>,
-                  ops::RReluGradOpMaker<paddle::imperative::OpBase>,
-                  RReluInferShapeFunctor);
-
-DECLARE_INFER_SHAPE_FUNCTOR(rrelu_grad,
-                            RReluGradInferShapeFunctor,
-                            PD_INFER_META(phi::RReluGradInferMeta));
-REGISTER_OPERATOR(rrelu_grad, ops::RReluGradOp, RReluGradInferShapeFunctor);
diff --git a/paddle/fluid/operators/share_data_op.cc b/paddle/fluid/operators/share_data_op.cc
index 074ca142c9567..39bb37907f841 100644
--- a/paddle/fluid/operators/share_data_op.cc
+++ b/paddle/fluid/operators/share_data_op.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 
 class ShareDataOp : public framework::OperatorWithKernel {
  public:
@@ -58,8 +57,7 @@ Return a tensor $Out$ that shares data with the input tensor $X$ and without ten
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc
deleted file mode 100644
index 014cf8157d8ea..0000000000000
--- a/paddle/fluid/operators/shuffle_batch_op.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <atomic>
-#include <cstring>
-#include <ctime>
-#include <memory>
-#include <random>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "glog/logging.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/platform/timer.h"
-#include "paddle/phi/core/mixed_vector.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-class ShuffleBatchOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"),
-                      true,
-                      phi::errors::NotFound("Input(X) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Seed"),
-                      true,
-                      phi::errors::NotFound("Input(Seed) should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"),
-                      true,
-                      phi::errors::NotFound("Output(Out) should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("ShuffleIdx"),
-        true,
-        phi::errors::NotFound("Output(ShuffleIdx) should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("SeedOut"),
-        true,
-        phi::errors::NotFound("Output(SeedOut) should not be null."));
-
-    ctx->ShareDim("X", "Out");
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareDim("Seed", "SeedOut");
-    ctx->ShareLoD("Seed", "SeedOut");
-    ctx->SetOutputDim("ShuffleIdx", common::make_ddim({-1}));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    return phi::KernelKey(data_type, ctx.GetPlace());
-  }
-
-  phi::KernelKey GetKernelTypeForVar(
-      const std::string &var_name,
-      const phi::DenseTensor &tensor,
-      const phi::KernelKey &expected_kernel_type) const override {
-    if (var_name == "Seed") {
-      return phi::KernelKey(phi::Backend::ALL_BACKEND,
-                            expected_kernel_type.layout(),
-                            expected_kernel_type.dtype());
-    }
-    return framework::OperatorWithKernel::GetKernelTypeForVar(
-        var_name, tensor, expected_kernel_type);
-  }
-};
-
-class ShuffleBatchOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(phi::DenseTensor) The input tensor of shuffle_batch op.");
-    AddInput("Seed", "(phi::DenseTensor) The input seed tensor.");
-    AddAttr<int>(
-        "startup_seed",
-        "If input tensor 'Seed' is not initialized, the 'startup_seed' "
-        "will be used to replace it. The seed after shuffle batch will "
-        "be saved in 'SeedOut'. ")
-        .SetDefault(0);
-    AddOutput("Out",
-              "(phi::DenseTensor) The output tensor of shuffle_batch op.");
-    AddOutput("ShuffleIdx", "(Tensor) Record forword shuffle order");
-    AddOutput("SeedOut", "(phi::DenseTensor) Saved new generated seed.");
-    AddComment(R"DOC(
-Shuffle Batch Operator.
-
-This operator is used to shuffle input $X$'s elements.
-
-There is 2 input. The product of input dims (except last dim) numbers of elements will be shuffled. $Seed$ is tensor of seed.
-
-There are 3 outputs. $Out$ is shuffled tensor of input. $ShuffleIdx$ is the tensor used to record shuffle order. $SeedOut$ is same tensor of $Seed$.
-)DOC");
-  }
-};
-
-class ShuffleBatchOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("ShuffleIdx"),
-        true,
-        phi::errors::NotFound("Input(ShuffleIdx) should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput(framework::GradVarName("Out")),
-        true,
-        phi::errors::NotFound("Grad Input(Out) should not be null"));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput(framework::GradVarName("X")),
-        true,
-        phi::errors::NotFound("Grad Output(X) should not be null"));
-
-    ctx->ShareDim(framework::GradVarName("Out"), framework::GradVarName("X"));
-    ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X"));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(
-        ctx, framework::GradVarName("Out"));
-    return phi::KernelKey(data_type, ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class ShuffleBatchGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("shuffle_batch_grad");
-    op->SetInput("ShuffleIdx", this->Output("ShuffleIdx"));
-    op->SetAttrMap(this->Attrs());
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(shuffle_batch,
-                  ops::ShuffleBatchOp,
-                  ops::ShuffleBatchOpMaker,
-                  ops::ShuffleBatchGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ShuffleBatchGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(shuffle_batch_grad, ops::ShuffleBatchOpGrad);
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc
index 2fc8268f71086..103b9d550f4c5 100644
--- a/paddle/fluid/operators/sync_batch_norm_op.cc
+++ b/paddle/fluid/operators/sync_batch_norm_op.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
 
-namespace paddle {
-namespace operators {
+namespace paddle::operators {
 template <typename T>
 class SyncBatchNormGradMaker : public framework::SingleGradOpMaker<T> {
  public:
@@ -46,8 +45,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace paddle::operators
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc
deleted file mode 100644
index 6e3804fcb0a92..0000000000000
--- a/paddle/fluid/operators/tdm_child_op.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#include "paddle/fluid/operators/tdm_child_op.h"
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "X(Tensor), dtype support int32/int64, X variable is the "
-             "node id of TDM-Tree");
-    AddInput(
-        "TreeInfo",
-        "TreeInfo(Tensor), dtype support int32/int64, it stores the node "
-        "information in the following format: item_id(shape=1), "
-        "layer_id(shape=1), parent_id(shape=1), child_id(shape=child_nums)");
-    AddAttr<int>("child_nums",
-                 "child_nums(int)"
-                 "The child nums of one node, if the node hasn't enough child, "
-                 "it should padding 0 until child nums equal to child_nums");
-    AddOutput("Child",
-              "Return the children's node_id of input node, "
-              "if input don't have child, return 0");
-    AddOutput("LeafMask",
-              "LeafMask has the same shape with Child"
-              "If child is leaf node, LeafMask value = 1, else = 0");
-    AddAttr<int>("dtype",
-                 "(int, default INT32) "
-                 "Output data type.")
-        .SetDefault(2);
-    AddComment(R"DOC("
-     **Tdm Child**
-     According to the input node_id on the given tree, return the corresponding child node_id and
-      whether child is a leaf node by LeafMask.")DOC");
-  }
-};
-
-class TDMChildOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Inputs(X) of TdmChild should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("TreeInfo"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Inputs(TreeInfo) of TdmChild should not be null."));
-
-    int child_nums = ctx->Attrs().Get<int>("child_nums");
-    PADDLE_ENFORCE_GT(
-        child_nums,
-        0,
-        phi::errors::InvalidArgument(
-            "ValueError: The value of the 'child_nums' must greater than 0. "
-            "But received child_nums value = %d, ",
-            child_nums));
-
-    auto info_dims = ctx->GetInputDim("TreeInfo");
-    auto input_dims = ctx->GetInputDim("X");
-
-    PADDLE_ENFORCE_EQ(
-        info_dims.size(),
-        2,
-        phi::errors::InvalidArgument(
-            "ShapeError: The dimensions of the 'tree info' must be 2. "
-            "But received tree info's dimensions = %d, "
-            "tree info's shape = [%s].",
-            info_dims.size(),
-            info_dims));
-
-    auto output_dims = common::vectorize(input_dims);
-    output_dims.push_back(child_nums);
-    ctx->SetOutputDim("Child", common::make_ddim(output_dims));
-    ctx->SetOutputDim("LeafMask", common::make_ddim(output_dims));
-
-    if (ctx->GetOutputsVarType("Child")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("X", /*->*/ "Child");
-      ctx->ShareLoD("X", /*->*/ "LeafMask");
-    }
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    return phi::KernelKey(data_type, ctx.GetPlace());
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(
-    tdm_child,
-    ops::TDMChildOp,
-    ops::TDMChildOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h
deleted file mode 100644
index b645566736a9d..0000000000000
--- a/paddle/fluid/operators/tdm_child_op.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <fstream>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "paddle/common/flags.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/mixed_vector.h"
-
-namespace paddle {
-namespace operators {}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc
deleted file mode 100644
index db2dd6b4ced37..0000000000000
--- a/paddle/fluid/operators/tdm_sampler_op.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-
-class TDMSamplerOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "X(Tensor), Input variable which"
-             "mapping the leaf node idx of tdm tree,"
-             "dtype support int32/int64");
-    AddInput("Travel",
-             "Travel(Tensor), must has the same dtype with Layer"
-             "Contains path information of all leaf nodes to root node,"
-             " dtype support int32/64");
-    AddInput("Layer",
-             "Layer(Tensor), must has the same dtype with Travel "
-             "Indicates which nodes are in each layer");
-    AddAttr<bool>("output_positive",
-                  "output_positive(bool)"
-                  "Whether positive samples are included in the output")
-        .SetDefault(true);
-    AddAttr<std::vector<int>>(
-        "neg_samples_num_list",
-        "neg_samples_num_list(python:list[int], C++:vector<int>)"
-        "The num of negative samples in each layer")
-        .SetDefault({});
-    AddAttr<std::vector<int>>("layer_offset_lod",
-                              "offset lod information of Layer")
-        .SetDefault({});
-    AddAttr<int>("seed",
-                 "(int) The seed used in sampler. If it is 0, "
-                 "the sampler will generate a seed randomly.")
-        .SetDefault(0);
-    AddAttr<int>("dtype",
-                 "(int, default INT32) "
-                 "Output data type.")
-        .SetDefault(2);
-    AddOutput("Out",
-              "Sampling result lodTensor, with shape [batch_size, layer_num, "
-              "neg_num_of_layer]");
-    AddOutput("Labels",
-              "Labels of sampling result, has the same shape with Out."
-              "pos samples mapping value 1, neg sample mapping value 0")
-        .AsDispensable();
-    AddOutput(
-        "Mask",
-        "Padding flag of Sampling result, if sampling res comes from padding,"
-        "it will be 0, else 1, lodTensor, with shape [batch_size, "
-        "layer_num, neg_num_of_layer]");
-    AddComment(R"DOC("
-        **TDM Sampler**
-        According to the input positive samples at leaf node, do negative sampling layer by layer on the given tree.")DOC");
-  }
-};
-
-class TDMSamplerOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Inputs(Input) of TdmSampler should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Travel"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Inputs(Travel) of TdmSampler should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Layer"),
-                      true,
-                      phi::errors::InvalidArgument(
-                          "Inputs(Layer) of TdmSampler should not be null."));
-    auto neg_samples_num_vec =
-        ctx->Attrs().Get<std::vector<int>>("neg_samples_num_list");
-    auto output_positive_flag = ctx->Attrs().Get<bool>("output_positive");
-
-    int64_t sample_res_length = 0;
-    for (auto sample_nums : neg_samples_num_vec) {
-      sample_res_length += sample_nums + (int64_t)output_positive_flag;
-    }
-
-    auto input_dims = ctx->GetInputDim("X");
-    auto ddim = common::make_ddim({-1, sample_res_length});
-    if (ctx->IsRuntime()) {
-      auto output_dims = common::vectorize(input_dims);
-      auto batch_size = output_dims[0];
-      ctx->SetOutputDim("Out",
-                        common::make_ddim({batch_size, sample_res_length}));
-      ctx->SetOutputDim("Labels",
-                        common::make_ddim({batch_size, sample_res_length}));
-      ctx->SetOutputDim("Mask",
-                        common::make_ddim({batch_size, sample_res_length}));
-    } else {
-      ctx->SetOutputDim("Out", ddim);
-      ctx->SetOutputDim("Labels", ddim);
-      ctx->SetOutputDim("Mask", ddim);
-    }
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    return phi::KernelKey(data_type, ctx.GetPlace());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(
-    tdm_sampler,
-    ops::TDMSamplerOp,
-    ops::TDMSamplerOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc
deleted file mode 100644
index 19334ca2dad6a..0000000000000
--- a/paddle/fluid/operators/transfer_layout_op.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/transfer_layout_op.h"
-
-#include <string>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace framework {
-class OpDesc;
-class InferShapeContext;
-template <typename T>
-class EmptyGradOpMaker;
-}  // namespace framework
-namespace imperative {
-class OpBase;
-}  // namespace imperative
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-
-class TransferLayoutOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    // kernel's device type is decided by input tensor place
-    auto *in = ctx.InputVar("X");
-    auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in);
-    // NOTE(zhiqiu): hot fix, allow empty tensor of kMKLDNN layout to run this
-    // op
-    if (in_tensor->layout() != DataLayout::ONEDNN) {
-      PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(),
-                        true,
-                        phi::errors::PreconditionNotMet(
-                            "The tensor of Input(X) is not initialized."));
-    }
-    auto place =
-        in_tensor->IsInitialized() ? in_tensor->place() : phi::CPUPlace();
-    phi::DataType dtype = in_tensor->IsInitialized() ? in_tensor->dtype()
-                                                     : phi::DataType::FLOAT32;
-    return phi::KernelKey(phi::TransToProtoVarType(dtype), place);
-  }
-
-  phi::KernelKey GetKernelTypeForVar(
-      const std::string &var_name,
-      const phi::DenseTensor &tensor,
-      const phi::KernelKey &expected_kernel_type) const override {
-    return phi::KernelKey(phi::Backend::ALL_BACKEND,
-                          expected_kernel_type.layout(),
-                          expected_kernel_type.dtype());
-  }
-};
-
-class TransferLayoutInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(framework::InferVarTypeContext *ctx) const override {
-    ctx->SyncTypeAndDataType("X", "Out");
-  }
-};
-
-class TransferLayoutKernel {
- public:
-  void operator()(const framework::ExecutionContext &ctx) const {
-    auto *x = ctx.InputVar("X");
-    auto *out = ctx.OutputVar("Out");
-    auto &dev_ctx = ctx.device_context();
-    auto src_layout = ctx.Attr<int>("src_layout");
-    auto dst_layout = ctx.Attr<int>("dst_layout");
-    auto input_name = ctx.InputName("X");
-    TransferLayoutFunctor(
-        x, out, dev_ctx, src_layout, dst_layout, input_name)();
-  }
-};
-
-class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(phi::DenseTensor) The input Tensor");
-    AddOutput("Out",
-              "(phi::DenseTensor) The Output Tensor with desired layout");
-    // NOTE(zhiqiu): in most case, the src_layout is not needed, the op can use
-    // the layout
-    // of input X. However, in some mkldnn kernel, the src layout computed by
-    // GetKernelTypeForVar is different with the layout of tensor X.
-    AddAttr<int>("src_layout",
-                 "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3, default "
-                 "-1 means unspecified and use the tensor's layout.")
-        .SetDefault(-1);
-    AddAttr<int>("dst_layout",
-                 "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3");
-    AddComment(R"DOC(
-    TransferLayout Operator)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(transfer_layout,
-                            TransferLayoutInferShapeFunctor,
-                            PD_INFER_META(phi::TransferLayoutInferMeta));
-REGISTER_OPERATOR(
-    transfer_layout,
-    ops::TransferLayoutOp,
-    ops::TransferLayoutOpProtoMaker,
-    ops::TransferLayoutInferVarType,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    TransferLayoutInferShapeFunctor);
-
-REGISTER_OP_VERSION(transfer_layout)
-    .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC",
-                   paddle::framework::compatible::OpVersionDesc().NewAttr(
-                       "src_layout",
-                       "(int, the layout of the input tensor",
-                       -1));
diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h
deleted file mode 100644
index 1b4ef2d1b5abb..0000000000000
--- a/paddle/fluid/operators/transfer_layout_op.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/data_transform.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace phi {
-class DenseTensor;
-}  // namespace phi
-
-namespace paddle {
-namespace framework {
-class Variable;
-}  // namespace framework
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-using DataLayout = phi::DataLayout;
-
-class TransferLayoutFunctor {
- public:
-  TransferLayoutFunctor(const framework::Variable *in,
-                        framework::Variable *out,
-                        const platform::DeviceContext &dev_ctx,
-                        const int src_layout,
-                        const int dst_layout,
-                        std::string in_name)
-      : in_(in),
-        out_(out),
-        dev_ctx_(dev_ctx),
-        src_layout_(src_layout),
-        dst_layout_(dst_layout),
-        in_name_(in_name) {}
-
-  void operator()() const {
-    auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
-    phi::DenseTensor out_tensor;
-
-    auto out_layout = static_cast<DataLayout>(dst_layout_);
-    out_tensor.set_layout(out_layout);
-
-#ifdef PADDLE_WITH_DNNL
-    // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in
-    // data_transfer.cc
-    auto in_layout = static_cast<DataLayout>(src_layout_);
-    auto *tensor_out = out_->GetMutable<phi::DenseTensor>();
-    VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout();
-    if (!in_tensor.IsInitialized() && in_layout == DataLayout::ONEDNN &&
-        out_layout == DataLayout::kNHWC) {
-      tensor_out->Resize(in_tensor.dims());
-      tensor_out->set_layout(out_layout);
-      phi::funcs::MatchShapeToLayout(tensor_out, in_layout, out_layout);
-      return;
-    }
-    if (in_layout == DataLayout::ONEDNN || out_layout == DataLayout::ONEDNN) {
-      PADDLE_ENFORCE_NE(
-          in_layout,
-          out_layout,
-          phi::errors::PreconditionNotMet(
-              "No layout transform needed between two oneDNN OPKernels."));
-
-      if (in_layout != DataLayout::ONEDNN && out_layout == DataLayout::ONEDNN) {
-        // Case1 - transform from Non-ONEDNN OPKernel to ONEDNN OPKernel
-        // Just set layout/format. No real transform occur
-
-        auto out_format = phi::funcs::OneDNNFormatForSize(
-            in_tensor.dims().size(), phi::funcs::ToOneDNNFormat(in_layout));
-        out_tensor.ShareDataWith(in_tensor);
-        // For NHWC data we need reshape of tensors as MKL-DNN
-        // is expecting NHWC dims description order
-        if (in_layout == DataLayout::kNHWC) {
-          VLOG(4) << "kNHWC";
-          phi::funcs::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
-          phi::OneDNNContext::tls().set_cur_paddle_data_layout(in_layout);
-        }
-        auto out_tz = out_tensor.dims().size() == 0
-                          ? std::vector<int64_t>{1}
-                          : common::vectorize(out_tensor.dims());
-        dnnl::memory::data_type in_type =
-            phi::funcs::ToOneDNNDataType(in_tensor.dtype());
-
-        dnnl::memory::desc out_mem_desc(out_tz, in_type, out_format);
-        out_tensor.set_mem_desc(out_mem_desc);
-      } else {
-        auto target_layout =
-            phi::OneDNNContext::tls().get_cur_paddle_data_layout();
-        // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in
-        // fetch_op.cc
-        if (out_layout == DataLayout::kNCHW &&
-            in_name_ == framework::GradVarName("Filter")) {
-          target_layout = out_layout;
-        }
-        VLOG(4) << "TransDataLayoutFromOneDNN: " << in_layout << "->"
-                << target_layout;
-        // Case2 - transform from ONEDNN OPKernel to Non-ONEDNN OPKernel
-        // Do transform via ONEDNN lib
-        phi::funcs::TransDataLayoutFromOneDNN(in_layout,
-                                              target_layout,
-                                              in_tensor,
-                                              &out_tensor,
-                                              dev_ctx_.GetPlace());
-      }
-    } else {
-      // Case3 - transform between Non-ONEDNN OPKernels
-      TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
-    }
-#else
-    // Case3 - transform between Non-ONEDNN OPKernels
-    TransDataLayout(dev_ctx_, in_tensor, &out_tensor);
-#endif
-    framework::SetTensorToVariable(*in_, out_tensor, out_);
-  }
-
- private:
-  void TransDataLayout(const platform::DeviceContext &dev_ctx,
-                       const phi::DenseTensor &in,
-                       phi::DenseTensor *out) const {
-    PADDLE_ENFORCE_EQ(
-        common::arity(in.dims()),
-        4,
-        phi::errors::InvalidArgument(
-            "Input dimension arity only can be 4, the input dimension is %s.",
-            in.dims()));
-
-    auto src_dim = in.dims();
-    std::vector<int64_t> dst_dim;
-
-    auto axis = framework::GetAxis(in.layout(), out->layout());
-    dst_dim.resize(axis.size());
-    for (size_t i = 0; i < axis.size(); i++) {
-      dst_dim[i] = src_dim[axis[i]];
-    }
-
-    out->Resize(common::make_ddim(dst_dim));
-    out->mutable_data(in.place(), in.type());
-
-    framework::VisitDataType(
-        framework::TransToProtoVarType(in.dtype()),
-        framework::CastDataLayout(&dev_ctx, axis, in, out));
-  }
-
-  const framework::Variable *in_;
-  framework::Variable *out_;
-  const platform::DeviceContext &dev_ctx_;
-  const int src_layout_;
-  const int dst_layout_;
-  std::string in_name_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt
index f9b7948de3329..32dda6620d26a 100644
--- a/paddle/fluid/pir/dialect/CMakeLists.txt
+++ b/paddle/fluid/pir/dialect/CMakeLists.txt
@@ -79,6 +79,9 @@ set(op_src_files_tmp
 
 set(op_vjp_src_file_tmp ${op_vjp_source_file_tmp})
 
+set(op_cc_split_num 4)
+set(bwd_op_cc_split_num 2)
+
 # Auto code gen
 execute_process(
   COMMAND ${PYTHON_EXECUTABLE} ${op_parse_file} --op_yaml_path
@@ -95,15 +98,31 @@ execute_process(
     --op_compat_yaml_file ${op_compat_yaml_file} --namespaces ${op_namespace}
     --dialect_name ${dialect_name} --op_def_h_file ${op_header_file_tmp}
     --op_info_file ${op_info_file_tmp} --op_def_cc_file ${op_src_files_tmp}
-    --op_vjp_cc_file ${op_vjp_src_file_tmp} --with_distributed
-    ${WITH_DISTRIBUTE})
+    --op_vjp_cc_file ${op_vjp_src_file_tmp} --op_cc_split_num
+    ${op_cc_split_num} --bwd_op_cc_split_num ${bwd_op_cc_split_num}
+    --with_distributed ${WITH_DISTRIBUTE})
+
+# Derive the split source lists from the counts passed to the generator above,
+# so the lists cannot drift from op_cc_split_num / bwd_op_cc_split_num.
+# Assumes the generator writes pd_op<i>.cc and pd_op_bwd<i>.cc for
+# i = 1..count (TODO confirm against the generator script).
+set(split_op_source_files "")
+foreach(split_idx RANGE 1 ${op_cc_split_num})
+  list(APPEND split_op_source_files
+       ${PIR_DIALECT_BINARY_DIR}/pd_op${split_idx}.cc)
+endforeach()
+set(split_bwd_op_source_files "")
+foreach(split_idx RANGE 1 ${bwd_op_cc_split_num})
+  list(APPEND split_bwd_op_source_files
+       ${PIR_DIALECT_BINARY_DIR}/pd_op_bwd${split_idx}.cc)
+endforeach()
 
 set(generated_files_pd_op
     "${op_header_file}"
     "${op_info_file}"
-    "${op_source_file}"
+    "${split_op_source_files}"
+    "${split_bwd_op_source_files}"
     "${op_vjp_source_file}"
-    "${bwd_op_source_file}"
     "${fused_op_source_file}"
     "${bwd_fused_op_source_file}"
     "${pir_op_source_file}"
@@ -177,7 +196,7 @@ set(python_c_header_file_tmp ${python_c_header_file}.tmp)
 set(python_c_source_file_tmp ${python_c_source_file}.tmp)
 
 set(trimmed_op_yaml_files
-    ${op_fwd_yaml},${op_bwd_yaml},${fused_op_fwd_yaml},${fused_op_bwd_yaml},${pir_op_fwd_yaml},${pir_op_bwd_yaml},${pir_update_op_fwd_yaml}
+    ${op_fwd_yaml},${op_bwd_yaml},${fused_op_fwd_yaml},${fused_op_bwd_yaml},${pir_op_fwd_yaml},${pir_op_bwd_yaml},${pir_update_op_fwd_yaml},${pir_op_fwd_sparse_yaml},${pir_op_bfd_sparse_yaml}
 )
 
 execute_process(
@@ -247,8 +266,8 @@ set(op_dialect_srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_attribute.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_type.cc
     ${op_info_file}
-    ${op_source_file}
-    ${bwd_op_source_file}
+    ${split_op_source_files}
+    ${split_bwd_op_source_files}
     ${fused_op_source_file}
     ${bwd_fused_op_source_file}
     ${pir_op_source_file}
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc
index 4d921bed45f4b..195813f87243e 100644
--- a/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc
@@ -25,8 +25,7 @@
 #include "paddle/pir/include/core/value.h"
 #include "paddle/utils/flat_hash_map.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 pir::Value shard_tensor(
     const pir::Value& x,
@@ -64,5 +63,4 @@ pir::Value reshard(const pir::Value& x,
   return reshard_op.result(0);
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc
index fc261efe9e04c..73dcc128d1fb8 100644
--- a/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc
@@ -25,8 +25,7 @@
 #include "paddle/pir/include/core/builtin_op.h"
 #include "paddle/pir/include/core/ir_context.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 const char* ShardTensorOp::attributes_name[1] = {"op_dist_attr"};  // NOLINT
 const char* ReshardOp::attributes_name[1] = {"op_dist_attr"};      // NOLINT
@@ -346,8 +345,7 @@ void ReshardOp::Build(pir::Builder& builder,
   ::pir::PassStopGradientsDefaultly(argument);
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ShardTensorOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ReshardOp)
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc
index 505b178a452b0..3ec0fd959cc38 100644
--- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc
@@ -17,8 +17,7 @@
 #include "paddle/common/enforce.h"
 #include "paddle/pir/include/core/operation.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 bool AllInputAreDist(const std::vector<pir::Value>& inputs) {
   for (auto value : inputs) {
@@ -133,6 +132,7 @@ pir::Attribute CvtToPirAttr(const phi::distributed::ArgDistAttr& dist_attr) {
     auto& vec = PADDLE_GET_CONST(std::vector<phi::distributed::TensorDistAttr>,
                                  dist_attr);
     std::vector<pir::Attribute> array;
+    array.reserve(vec.size());
     for (auto& attr : vec) {
       array.push_back(TensorDistAttribute::get(ctx,
                                                attr.process_mesh(),
@@ -159,10 +159,10 @@ pir::Attribute CreateReplicatedDistAttr(pir::Type prim_type,
   }
   return nullptr;
 }
-pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) {
-  if (!prim_type) return nullptr;
+pir::Type CvtToPirDistType(pir::Type global_type, pir::Attribute dist_attr) {
+  if (!global_type) return nullptr;
   auto ctx = pir::IrContext::Instance();
-  if (auto dense_tensor_type = prim_type.dyn_cast<pir::DenseTensorType>()) {
+  if (auto dense_tensor_type = global_type.dyn_cast<pir::DenseTensorType>()) {
     auto tensor_dist_attr = dist_attr.dyn_cast<TensorDistAttribute>();
     if (!tensor_dist_attr) {
       VLOG(0) << "Convert dense tensor type to dist type with attribute {"
@@ -172,7 +172,7 @@ pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) {
           "with non-empty TensorDistAttr"));
     }
     return DistDenseTensorType::get(ctx, dense_tensor_type, tensor_dist_attr);
-  } else if (auto vec_type = prim_type.dyn_cast<pir::VectorType>()) {
+  } else if (auto vec_type = global_type.dyn_cast<pir::VectorType>()) {
     auto array_attr = dist_attr.dyn_cast<pir::ArrayAttribute>();
     if (!array_attr) {
       VLOG(0) << "Convert vector type to dist type with attribute {"
@@ -192,8 +192,8 @@ pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) {
     }
     return pir::VectorType::get(ctx, dist_vec_type);
   } else {
-    VLOG(0) << "Convert type{" << prim_type << "} to dist type with attribute {"
-            << dist_attr << "}.";
+    VLOG(0) << "Convert type{" << global_type
+            << "} to dist type with attribute {" << dist_attr << "}.";
     PADDLE_THROW(common::errors::InvalidArgument(
         "Currently only support convert dense_tensor_type r vector type to "
         "dist."));
@@ -225,5 +225,4 @@ void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr) {
     }
   }
 }
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h
index a50331a8ea395..10f76a86e600d 100644
--- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h
@@ -37,7 +37,7 @@ pir::Attribute CvtToPirAttr(const phi::distributed::ArgDistAttr& dist_attr);
 pir::Attribute CreateReplicatedDistAttr(pir::Type prim_type,
                                         ProcessMeshAttribute mesh);
 
-pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr);
+pir::Type CvtToPirDistType(pir::Type global_type, pir::Attribute dist_attr);
 
 ///
 /// When the following conditions are met:
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc
index d1b70c24a1c56..5d03f093175cf 100644
--- a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc
@@ -16,8 +16,7 @@
 #include "paddle/fluid/pir/dialect/distributed/ir/type_storage.h"
 #include "paddle/pir/include/core/ir_context.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 pir::DenseTensorType DistDenseTensorType::dense_tensor_type() const {
   return storage()->dense_tensor_type;
@@ -69,7 +68,6 @@ pir::DenseTensorType DistDenseTensorType::local_type() const {
                                    offset());
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::DistDenseTensorType)
diff --git a/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc b/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc
index caccbe69ed091..bbfe95ea4ffa6 100644
--- a/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc
+++ b/paddle/fluid/pir/dialect/distributed/transforms/dist_to_dense_pass.cc
@@ -38,8 +38,7 @@ using paddle::dialect::DistDenseTensorType;
 
 COMMON_DECLARE_bool(print_ir);
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 pir::Type CastToLocalType(pir::Type type) {
   if (auto dist_type = type.dyn_cast<DistTypeInterface>()) {
@@ -164,5 +163,4 @@ void DistToDensePass(pir::Program* prog) {
   }
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc b/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc
index 4191eaa4bce50..5d1a9b87431f1 100644
--- a/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc
+++ b/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc
@@ -35,7 +35,11 @@ class FusedAllReduceSplitPattern : public paddle::drr::DrrPatternBase {
     const auto &c_allreduce_sum_ =
         pat.Op(paddle::dialect::CAllreduceSum_Op::name(),
                {{"ring_id", pat.Attr("ring_id")},
-                {"use_calc_stream", pat.Attr("use_calc_stream")}});
+                {"use_calc_stream", pat.Attr("use_calc_stream")},
+                {"execution_stream", pat.Attr("execution_stream")},
+                {"force_record_event", pat.Attr("force_record_event")},
+                {"event_to_record", pat.Attr("event_to_record")},
+                {"events_to_wait", pat.Attr("events_to_wait")}});
     const auto &assign = pat.Op(paddle::dialect::AssignOp::name());
     const auto &full = pat.Op(paddle::dialect::FullOp::name());
     const auto &split_with_num = pat.Op(paddle::dialect::SplitWithNumOp::name(),
@@ -74,7 +78,11 @@ class FusedAllReduceSplitPattern : public paddle::drr::DrrPatternBase {
         res.Op(paddle::dialect::CReducescatterOp::name(),
                {{"ring_id", pat.Attr("ring_id")},
                 {"nranks", pat.Attr("num")},
-                {"use_calc_stream", pat.Attr("use_calc_stream")}});
+                {"use_calc_stream", pat.Attr("use_calc_stream")}},
+               {{"execution_stream", pat.Attr("execution_stream")},
+                {"force_record_event", pat.Attr("force_record_event")},
+                {"event_to_record", pat.Attr("event_to_record")},
+                {"events_to_wait", pat.Attr("events_to_wait")}});
 
     c_reducescatter({&res.Tensor("input_grad_partial")}, {&res.Tensor("out")});
   }
diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc
index f1d5b85e357d1..01687ca360257 100644
--- a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc
+++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc
@@ -19,8 +19,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/pir/include/core/builtin_attribute.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 const char* PhiKernelOp::attributes_name[attributes_num] = {  // NOLINT
     "op_name",
@@ -260,8 +259,7 @@ phi::KernelKey OneDNNLegacyKernelOp::kernel_key() {
 }
 #endif
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::PhiKernelOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::LegacyKernelOp)
diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py
index f7cbf42580b9d..521e9ea90bbf0 100644
--- a/paddle/fluid/pir/dialect/op_generator/api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py
@@ -114,7 +114,7 @@
     }}
 """
 
-AMP_OPTIONAL_INPUTS_TEMPLATE = """if ({optional_input}) amp_values_vector.push_back({vec_optional_input});
+AMP_OPTIONAL_INPUTS_TEMPLATE = """if ({optional_input}) {{ amp_values_vector.push_back({vec_optional_input}); }}
 """
 
 AMP_NEW_INPUTS_TEMPLATE = """auto new_{input} = paddle::imperative::{cast_func}("{input}", {input}, amp_dst_dtype, op_name);
diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py
index 36d3a26f680a0..ed4b1bae54650 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py
@@ -14,6 +14,7 @@
 
 import argparse
 import logging
+import math
 import os
 import pathlib
 import sys
@@ -1130,6 +1131,21 @@ def get_mutable_attribute_grad_semantic(op_info, op_info_items):
     return mutable_attribute_grad_semantics
 
 
+def split_ops(op_info_items: dict, cc_file, split_nums):
+    """Split ops, in order, into split_nums chunks, one cc file per chunk."""
+    op_list = list(op_info_items.keys())
+    # Chunk capacity; trailing chunks stay empty when the ops run out early.
+    ops_max_size = math.ceil(len(op_list) / split_nums)
+    chunks = [{} for _ in range(split_nums)]
+    for i, op_name in enumerate(op_list):
+        chunks[i // ops_max_size][op_name] = op_info_items[op_name]
+    # Number only the last ".cc" (e.g. x.cc.tmp -> x1.cc.tmp): replace() would
+    # also rewrite a ".cc" inside a directory name such as ".ccache".
+    parts = cc_file.rsplit(".cc", 1)
+    split_cc_files = [f"{i + 1}.cc".join(parts) for i in range(split_nums)]
+    return chunks, split_cc_files
+
+
 def GenOneDnnExtraAttrsDefaultValue(onednn_extra_args):
     INTARRAY_STR_TEMPLATE = """  pir::Attribute attr_{attr_name} = {op_attribute_type}::get(pir::IrContext::Instance(), phi::IntArray({attr}));
 """
@@ -2080,6 +2096,8 @@ def OpGenerator(
     op_info_file,
     op_def_cc_file,
     op_vjp_cc_file,
+    op_cc_split_num,
+    bwd_op_cc_split_num,
     onednn_yaml_file,
     ops_onednn_extra_yaml_file,
 ):
@@ -2126,9 +2144,11 @@ def OpGenerator(
 
     op_infos = []
     all_op_info_items = {}
+    new_op_def_cc_file = []
     first_file = True
     onednn_only_op_list = []
-    for yaml_file in op_yaml_files:
+    for idx in range(len(op_yaml_files)):
+        yaml_file = op_yaml_files[idx]
         op_yaml_items = []
         with open(yaml_file, "r") as f:
             ops = yaml.safe_load(f)
@@ -2194,13 +2214,37 @@ def OpGenerator(
             key_suffix = '_sp' if item.is_sparse_op else ''
             op_info_items[op['name'] + key_suffix] = item
             all_op_info_items[op['name'] + key_suffix] = item
-        op_infos.append(op_info_items)
+
+        if dialect_name != "onednn_op":
+            cc_file = op_def_cc_file[idx]
+            if (
+                yaml_file.split('/')[-1] == "ops.parsed.yaml"
+                and op_cc_split_num is not None
+            ):
+                split_op_info_items, split_cc_files = split_ops(
+                    op_info_items, cc_file, op_cc_split_num
+                )
+                op_infos.extend(split_op_info_items)
+                new_op_def_cc_file.extend(split_cc_files)
+            elif (
+                yaml_file.split('/')[-1] == "backward.parsed.yaml"
+                and bwd_op_cc_split_num is not None
+            ):
+                split_op_info_items, split_cc_files = split_ops(
+                    op_info_items, cc_file, bwd_op_cc_split_num
+                )
+                op_infos.extend(split_op_info_items)
+                new_op_def_cc_file.extend(split_cc_files)
+            else:
+                op_infos.append(op_info_items)
+                new_op_def_cc_file.append(cc_file)
 
         if first_file:
             first_file = False
 
     if dialect_name == "onednn_op":
         op_infos = [all_op_info_items]
+        new_op_def_cc_file = op_def_cc_file
     # (3) auto code gen
     op_list_strs = []
     declare_type_id_strs = []
@@ -2329,7 +2373,7 @@ def OpGenerator(
             f.write(op_info_str)
 
     # (6) write to files for xx_op.cc.tmp
-    for id in range(len(op_def_cc_file)):
+    for id in range(len(new_op_def_cc_file)):
         source_file_str = source_file_strs[id]
         for name in reversed(namespaces):
             source_file_str = NAMESPACE_GARD_TEMPLATE.format(
@@ -2349,7 +2393,7 @@ def OpGenerator(
             input=source_file_str,
             define_type_id=define_type_id_strs[id],
         )
-        with open(op_def_cc_file[id], 'w') as f:
+        with open(new_op_def_cc_file[id], 'w') as f:
             f.write(source_file_str)
 
     # (6) write to files for xx_vjp_op.cc.tmp
@@ -2381,6 +2425,8 @@ def ParseArguments():
     parser.add_argument('--op_info_file', type=str)
     parser.add_argument('--op_def_cc_file', type=str)
     parser.add_argument('--op_vjp_cc_file', type=str)
+    parser.add_argument('--op_cc_split_num', type=int)
+    parser.add_argument('--bwd_op_cc_split_num', type=int)
     parser.add_argument('--onednn_yaml_file', type=str)
     parser.add_argument('--ops_onednn_extra_yaml_file', type=str)
     parser.add_argument('--with_distributed', type=strtobool)
@@ -2403,6 +2449,8 @@ def ParseArguments():
     op_info_file = args.op_info_file
     op_def_cc_files = args.op_def_cc_file.split(",")
     op_vjp_cc_file = args.op_vjp_cc_file
+    op_cc_split_num = args.op_cc_split_num
+    bwd_op_cc_split_num = args.bwd_op_cc_split_num
     onednn_yaml_file = args.onednn_yaml_file
     ops_onednn_extra_yaml_file = args.ops_onednn_extra_yaml_file
 
@@ -2417,6 +2465,8 @@ def ParseArguments():
         op_info_file,
         op_def_cc_files,
         op_vjp_cc_file,
+        op_cc_split_num,
+        bwd_op_cc_split_num,
         onednn_yaml_file,
         ops_onednn_extra_yaml_file,
     )
diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index 090aab4e3c4ed..1eb784ed8c0e9 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -59,15 +59,36 @@
   }}
 }}"""
 
+
 STATIC_ONLY_FUNCTION_IMPL_TEMPLATE = """
 static PyObject *{name}(PyObject *self, PyObject *args, PyObject *kwargs) {{
   VLOG(6) << "Call static_api_{name}";
   return static_api_{name}(self, args, kwargs);
 }}"""
 
+SPARSE_FUNCTION_IMPL_TEMPLATE = """
+static PyObject *sparse_{name}(PyObject *self, PyObject *args, PyObject *kwargs) {{
+  if (egr::Controller::Instance().GetCurrentTracer() == nullptr) {{
+    VLOG(6) << "Call static_api_{name}";
+    return static_api_{name}{name_suffix}(self, args, kwargs);
+  }} else {{
+    VLOG(6) << "Call eager_api_{name}";
+    return sparse::eager_api_{name}(self, args, kwargs);
+  }}
+}}"""
+
+SPARSE_STATIC_ONLY_FUNCTION_IMPL_TEMPLATE = """
+static PyObject *sparse_{name}(PyObject *self, PyObject *args, PyObject *kwargs) {{
+  VLOG(6) << "Call static_api_{name}";
+  return static_api_{name}{name_suffix}(self, args, kwargs);
+}}"""
+
 OPS_API_TEMPLATE = """
 {{"{name}", (PyCFunction)(void (*)(void)){name}, METH_VARARGS | METH_KEYWORDS, "C++ interface function for {name}."}},"""
 
+SPARSE_OPS_API_TEMPLATE = """
+{{"sparse_{name}", (PyCFunction)(void (*)(void))sparse_{name}, METH_VARARGS | METH_KEYWORDS, "C++ interface function for sparse_{name}."}},"""
+
 NEED_GEN_STATIC_ONLY_APIS = [
     'c_allreduce_avg_',
     'c_reduce_avg',
@@ -158,6 +179,7 @@
     'fused_adam_',
     'fused_batch_norm_act_',
     'fused_bn_add_activation_',
+    'fused_elemwise_activation',
     'fused_elemwise_add_activation',
     'fused_scale_bias_relu_conv_bn',
     'fused_scale_bias_add_relu',
@@ -168,6 +190,8 @@
     'fused_elementwise_div',
     'fused_elementwise_mul',
     'fused_elementwise_sub',
+    'fusion_group',
+    'fusion_lstm',
     'fusion_seqpool_cvm_concat',
     'nce',
     'lars_momentum',
@@ -240,9 +264,17 @@ def _gen_one_function_impl(self, name):
         else:
             return FUNCTION_IMPL_TEMPLATE.format(name=name)
 
+    def _gen_sparse_one_function_impl(self, name, name_suffix):
+        return SPARSE_FUNCTION_IMPL_TEMPLATE.format(
+            name=name, name_suffix=name_suffix
+        )
+
     def _gen_one_ops_api(self, name):
         return OPS_API_TEMPLATE.format(name=name)
 
+    def _gen_sparse_one_ops_api(self, name):
+        return SPARSE_OPS_API_TEMPLATE.format(name=name)
+
     def gen_cpp_file(
         self, op_yaml_files, op_compat_yaml_file, namespaces, cpp_file_path
     ):
@@ -255,22 +287,15 @@ def gen_cpp_file(
             for op_name in op_info.op_phi_name:
                 if self._need_skip(op_info, op_name):
                     continue
-                sparse_op_inplace_name_suffix = ''
-                sparse_op_name_suffix = ''
-                if op_name[-1] == "_":
-                    function_impl_str += self._gen_one_function_impl(
-                        op_name + sparse_op_inplace_name_suffix
-                    )
-                    ops_api_str += self._gen_one_ops_api(
-                        op_name + sparse_op_inplace_name_suffix
+                if op_info.is_sparse_op:
+                    op_name_suffix = "sp_" if op_name[-1] == "_" else "_sp"
+                    function_impl_str += self._gen_sparse_one_function_impl(
+                        op_name, op_name_suffix
                     )
+                    ops_api_str += self._gen_sparse_one_ops_api(op_name)
                 else:
-                    function_impl_str += self._gen_one_function_impl(
-                        op_name + sparse_op_name_suffix
-                    )
-                    ops_api_str += self._gen_one_ops_api(
-                        op_name + sparse_op_name_suffix
-                    )
+                    function_impl_str += self._gen_one_function_impl(op_name)
+                    ops_api_str += self._gen_one_ops_api(op_name)
 
         inner_body = NAMESPACE_INNER_TEMPLATE.format(
             function_impl=function_impl_str, ops_api=ops_api_str
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
index 0aec58d385311..1dcb931607f13 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h"
 
 namespace cinn::dialect {
+using paddle::dialect::details::CreateShapeOrDataForXShape;
 
 bool BroadcastOpInferSymbolicShape(
     pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) {
@@ -192,6 +193,11 @@ bool ReshapeOpInferSymbolicShape(
   }();
 
   infer_context->SetShapeOrDataForValue(op->result(0), shape_data);
+  // NOTE(Aurelius84): Parse XShape symbolic expression which is used for
+  // backward process. It will be removed after normalizing ReshapeGrad(out,
+  // xshape) into ReshapeGrad(out, x).
+  infer_context->SetShapeOrDataForValue(op->result(1),
+                                        CreateShapeOrDataForXShape(x_dim_expr));
 
   return true;
 }
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc
index 7b6cc088a9c3b..61dbdff08e064 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc
@@ -99,6 +99,18 @@ bool ReduceInferDim(pir::Operation *op,
   return true;
 }
 
+symbol::ShapeOrDataDimExprs CreateShapeOrDataForXShape(
+    const symbol::ShapeOrDataDimExprs &x_dim_exprs) {
+  const auto InsertZeros =
+      [](const std::vector<symbol::DimExpr> &dims) -> decltype(auto) {
+    auto out_dims = dims;
+    out_dims.insert(out_dims.begin(), 0);
+    return out_dims;
+  };
+  const auto &x_dims = x_dim_exprs.shape();
+  return symbol::TensorShapeOrDataDimExprs(InsertZeros(x_dims));
+}
+
 void BuildCstrEqForTensorListAlongAxis(
     pir::InferSymbolicShapeContext *infer_context,
     const symbol::TensorListShapeOrDataDimExprs &shape_data_list,
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h
index c6e348140981f..a510c828cdf9e 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h
@@ -118,6 +118,9 @@ inline ExprVec GetExprVecFromShape(const ShapeOrData &shapeordata) {
   }
 }
 
+symbol::ShapeOrDataDimExprs CreateShapeOrDataForXShape(
+    const symbol::ShapeOrDataDimExprs &x_dim_exprs);
+
 std::optional<std::vector<int64_t>> VecExpr2Int64(const ExprVec &expr_vec);
 
 ExprVec VecInt642Expr(const std::vector<int64_t> &int_vec);
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc
index 5c7f01606c2df..777868c691c74 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc
@@ -274,7 +274,7 @@ bool ConcatOpInferSymbolicShape(pir::Operation *op,
       SetShapeOrDataForAxis(axis);
     } else {
       pir::Value res = op->result(0);
-      infer_context->SetStaticShapeForValue(res);
+      infer_context->SetSymbolForValueByStaticShape(res);
       // update axis value
       auto res_shape = infer_context->GetShapeOrDataForValue(res);
       for (size_t i = 0; i < rank; ++i) {
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc
index 3909b64651c40..03f48884cf165 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_result.cc
@@ -178,10 +178,6 @@ bool ScaleOpInferSymbolicShape(pir::Operation *op,
 
 }  // namespace paddle::dialect
 
-namespace cinn::dialect {
-using paddle::dialect::ReverseOpInferSymbolicShape;
-using paddle::dialect::ScaleOpInferSymbolicShape;
-using paddle::dialect::SelectOpInferSymbolicShape;
-}  // namespace cinn::dialect
+namespace cinn::dialect {}  // namespace cinn::dialect
 
 #undef OP_SAME_OPERANDS_AND_RESULT
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc
index 083918ebcd6bc..a98fa1ce7c9b5 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h"
 
 namespace paddle::dialect {
+using paddle::dialect::details::CreateShapeOrDataForXShape;
 
 bool ArgmaxOpInferSymbolicShape(pir::Operation *op,
                                 pir::InferSymbolicShapeContext *infer_context) {
@@ -586,19 +587,6 @@ bool RepeatInterleaveOpInferSymbolicShape(
   return true;
 }
 
-symbol::ShapeOrDataDimExprs CreateShapeOrDataForXShape(
-    const symbol::ShapeOrDataDimExprs &x_shape) {
-  const std::vector<symbol::DimExpr> result = [&] {
-    std::vector<symbol::DimExpr> new_x_dims;
-    new_x_dims.reserve(x_shape.shape().size() + 1);
-    new_x_dims.push_back(symbol::DimExpr{0});
-    new_x_dims.insert(
-        new_x_dims.end(), x_shape.shape().begin(), x_shape.shape().end());
-    return new_x_dims;
-  }();
-  return symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(result)};
-}
-
 bool ReshapeOpInferSymbolicShape(
     pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) {
   const symbol::ShapeOrDataDimExprs &x_dim_expr =
diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc
index 783e56a3c505e..50f5e9f622ac6 100644
--- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc
@@ -22,8 +22,7 @@
 #include "paddle/pir/include/core/ir_context.h"
 #include "paddle/pir/include/pass/utils.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 template <typename ConcreteOp>
 void RewriteByInfermeta(pir::Operation* op, common::DataLayout new_layout) {
@@ -38,6 +37,45 @@ void RewriteByInfermeta(pir::Operation* op, common::DataLayout new_layout) {
   }
 }
 
+template <>
+std::vector<pir::Value> RelevantInputsImpl<AddGroupNormSiluOp>(
+    pir::Operation* op) {
+  auto concrete_op = op->dyn_cast<AddGroupNormSiluOp>();
+  return {concrete_op.x(), concrete_op.residual()};
+}
+
+template <>
+std::vector<pir::Value> RelevantOutputsImpl<AddGroupNormSiluOp>(
+    pir::Operation* op) {
+  auto concrete_op = op->dyn_cast<AddGroupNormSiluOp>();
+  return {concrete_op.y(), concrete_op.residual_out()};
+}
+
+template <>
+common::DataLayout PreferLayoutImpl<AddGroupNormSiluOp>(pir::Operation* op) {
+  // Note(bukejiyu): add_group_norm_silu only supports NHWC layout now.
+  return common::DataLayout::NHWC;
+}
+
+template <>
+void RewriteByLayoutImpl<AddGroupNormSiluOp>(pir::Operation* op,
+                                             common::DataLayout new_layout) {
+  op->set_attribute(
+      "data_format",
+      pir::StrAttribute::get(pir::IrContext::Instance(),
+                             common::DataLayoutToString(new_layout)));
+
+  std::vector<pir::Type> new_outputs = AddGroupNormSiluOp::InferMeta(
+      op->operands_source(), const_cast<pir::AttributeMap*>(&op->attributes()));
+  for (size_t i = 0; i < new_outputs.size(); ++i) {
+    op->result(i).set_type(new_outputs[i]);
+  }
+
+  for (auto value : RelevantOutputsImpl<AddGroupNormSiluOp>(op)) {
+    SetNewLayoutForValue(value, new_layout);
+  }
+}
+
 template <>
 common::DataLayout PreferLayoutImpl<Conv2dOp>(pir::Operation* op) {
   auto data_format_attr = op->attribute<pir::StrAttribute>("data_format");
@@ -48,13 +86,27 @@ common::DataLayout PreferLayoutImpl<Conv2dOp>(pir::Operation* op) {
         data_format_attr));
   }
 
-  // Note(lyk): We exhibit the layout transformation for conv2d
-  // due to issues with its infermeta and kernel not functioning
-  // properly in NHWC layout. However, if the FLAGS_manually_trans_conv_filter
-  // is enabled, the transfer_layout_pass can also operate correctly.
+  auto concrete_op = op->dyn_cast<Conv2dOp>();
+  if (auto in = concrete_op.input()) {
+    if (auto in_type = in.type()) {
+      if (in_type.isa<DenseTensorType>()) {
+        if (auto tensor_type = in_type.dyn_cast<DenseTensorType>()) {
+          if (tensor_type.dtype().isa<pir::Float16Type>()) {
+            return common::DataLayout::NHWC;
+          }
+        }
+      }
+    }
+  }
+
   return common::StringToDataLayout(data_format_attr.AsString());
 }
 
+template <>
+bool CanBeModifiedImpl<Conv2dOp>(pir::Operation* op) {
+  return false;
+}
+
 template <>
 void RewriteByLayoutImpl<Conv2dOp>(pir::Operation* op,
                                    common::DataLayout new_layout) {
@@ -78,6 +130,14 @@ common::DataLayout PreferLayoutImpl<FusedConv2dAddActOp>(pir::Operation* op) {
   auto original_layout =
       common::StringToDataLayout(data_format_attr.AsString());
 
+  if (op->HasAttribute(kForceBackendAttr) &&
+      op->attributes()
+              .at(kForceBackendAttr)
+              .dyn_cast<pir::StrAttribute>()
+              .AsString() == "gpu") {
+    return common::DataLayout::NHWC;
+  }
+
   auto concrete_op = op->dyn_cast<FusedConv2dAddActOp>();
   if (auto in = concrete_op.input()) {
     if (auto in_type = in.type()) {
@@ -124,6 +184,31 @@ void RewriteByLayoutImpl<FusedConv2dAddActOp>(pir::Operation* op,
   RewriteByInfermeta<FusedConv2dAddActOp>(op, new_layout);
 }
 
+template <>
+bool CanBeModifiedImpl<FusedConv2dAddActOp>(pir::Operation* op) {
+  auto data_format_attr = op->attribute<pir::StrAttribute>("data_format");
+  if (!data_format_attr) {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "op (%s) should have attribute `data_format`, but got %s",
+        op,
+        data_format_attr));
+  }
+  auto cur_layout = common::StringToDataLayout(data_format_attr.AsString());
+  auto prefer_layout = PreferLayoutImpl<FusedConv2dAddActOp>(op);
+  auto can_be_modified = cur_layout != prefer_layout;
+
+  for (auto value : RelevantOutputsImpl<FusedConv2dAddActOp>(op)) {
+    // TODO(lyk) if value was used in another block, we cannot rewrite this op
+    for (auto it = value.use_begin(); it != value.use_end(); ++it) {
+      if (it->owner()->GetParent() != op->GetParent()) {
+        return false;
+      }
+    }
+  }
+
+  return can_be_modified;
+}
+
 template <>
 void RewriteByLayoutImpl<GroupNormOp>(pir::Operation* op,
                                       common::DataLayout new_layout) {
@@ -319,6 +404,5 @@ void RewriteByLayoutImpl<SwishOp>(pir::Operation* op,
   RewriteByInfermeta<SwishOp>(op, new_layout);
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::LayoutTransformationInterface)
diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
index 05719bc1dfb2f..cb4767498bf23 100644
--- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
+++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
@@ -105,9 +105,11 @@ bool CanBeModifiedImpl(pir::Operation* op) {
 class FusedConv2dAddActOp;
 OVERLOAD_PREFER_LAYOUT(FusedConv2dAddActOp);
 OVERLOAD_REWRITE_BY_LAYOUT(FusedConv2dAddActOp);
+OVERLOAD_CAN_BE_MODIFIED(FusedConv2dAddActOp);
 
 class Conv2dOp;
 OVERLOAD_PREFER_LAYOUT(Conv2dOp);
+OVERLOAD_CAN_BE_MODIFIED(Conv2dOp);
 OVERLOAD_REWRITE_BY_LAYOUT(Conv2dOp);
 
 class GroupNormOp;
@@ -115,6 +117,12 @@ OVERLOAD_REWRITE_BY_LAYOUT(GroupNormOp);
 OVERLOAD_RELEVANT_INPUTS(GroupNormOp);
 OVERLOAD_RELEVANT_OUTPUTS(GroupNormOp);
 
+class AddGroupNormSiluOp;
+OVERLOAD_REWRITE_BY_LAYOUT(AddGroupNormSiluOp);
+OVERLOAD_PREFER_LAYOUT(AddGroupNormSiluOp);
+OVERLOAD_RELEVANT_INPUTS(AddGroupNormSiluOp);
+OVERLOAD_RELEVANT_OUTPUTS(AddGroupNormSiluOp);
+
 class ReshapeOp;
 OVERLOAD_RELEVANT_INPUTS(ReshapeOp);
 OVERLOAD_RELEVANT_OUTPUTS(ReshapeOp);
diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
index 269bc4f115b13..f2b6702233c7d 100644
--- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
@@ -40,8 +40,7 @@ paddle::dialect::IfOp, paddle::dialect::WhileOp, paddle::dialect::HasElementsOp,
 using pir::TuplePopOp;
 using pir::TuplePushOp;
 constexpr char kStopGradientAttrName[] = "stop_gradient";  // NOLINT
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 void IfOp::Build(pir::Builder &builder,             // NOLINT
                  pir::OperationArgument &argument,  // NOLINT
@@ -1198,8 +1197,7 @@ void SelectOutputOp::VerifySig() {
   VLOG(4) << "End Verifying for: AssignArray_Op.";
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::WhileOp)
diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc b/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc
index fed0c6b224097..70b0f72bee55f 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc
@@ -17,8 +17,7 @@
 #include <utility>
 #include "paddle/common/enforce.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 IrSelectedRows::IrSelectedRows(phi::DataType dtype,
                                const phi::DDim& dims,
                                phi::DataLayout layout,
@@ -71,5 +70,4 @@ void* IrSelectedRows::AllocateFrom(phi::Allocator* allocator,
   IR_THROW("Don't use IrSelectedRows::AllocateFrom method.");
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc
index 020fafabb89e9..ea9a9d8b4b20f 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc
@@ -18,8 +18,7 @@
 
 #include "paddle/common/enforce.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 IrTensor::IrTensor(phi::DataType dtype,
                    const phi::DDim& dims,
                    phi::DataLayout layout,
@@ -70,5 +69,4 @@ void* IrTensor::AllocateFrom(phi::Allocator* allocator,
   IR_THROW("Don't use IrTensor::AllocateFrom method.");
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
index 7fb835dd01c90..2d705364b970f 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
@@ -71,11 +71,11 @@ void set_parameter(const pir::Value& parameter, const std::string& name) {
   }
 }
 
-void updata_parameter(const pir::Value& parameter, const std::string& name) {
+void update_parameter(const pir::Value& parameter, const std::string& name) {
   pir::Parameter* param = ApiBuilder::Instance().GetParameter(name);
   PADDLE_ENFORCE_NOT_NULL(param,
                           phi::errors::InvalidArgument(
-                              "Parameter %s not exist, can not updata.", name));
+                              "Parameter %s not exist, can not update.", name));
   std::unique_ptr<pir::Parameter> param_new(
       new pir::Parameter(nullptr, 0, parameter.type()));
   ApiBuilder::Instance().SetParameter(name, std::move(param_new));
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h
index 86d9b9a8245cc..7a89ae9eafaa8 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h
@@ -36,7 +36,7 @@ pir::Value parameter(const std::string& name);
 
 void set_parameter(const pir::Value& parameter, const std::string& name);
 
-void updata_parameter(const pir::Value& parameter, const std::string& name);
+void update_parameter(const pir::Value& parameter, const std::string& name);
 
 void shadow_output(const pir::Value& persist_value, const std::string& name);
 
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
index f266d6d172d35..96e24a6d10490 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
@@ -56,8 +56,7 @@ paddle::dialect::AddNOp, paddle::dialect::AddN_Op, paddle::dialect::AddNArrayOp,
 #include "paddle/phi/infermeta/spmd_rules/rules.h"
 #endif
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 OpInfoTuple AddNOp::GetOpInfo() {
   std::vector<paddle::dialect::OpInputInfo> inputs = {
@@ -4513,8 +4512,7 @@ phi::DataType ArrayPopOp::GetKernelTypeForVar(
   return expected_kernel_dtype;
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::AddNOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::SplitGradOp)
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc
index 05e30459029f5..246a08910308c 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp.cc
@@ -98,12 +98,22 @@ std::vector<std::vector<pir::Value>> BatchNormOp::Decomp(pir::Operation* op) {
   res[2].push_back(std::static_pointer_cast<primitive::LazyTensor>(
                        std::get<2>(op_res).impl())
                        ->value());
-  res[3].push_back(std::static_pointer_cast<primitive::LazyTensor>(
-                       std::get<3>(op_res).impl())
-                       ->value());
-  res[4].push_back(std::static_pointer_cast<primitive::LazyTensor>(
-                       std::get<4>(op_res).impl())
-                       ->value());
+  if (std::get<3>(op_res).initialized()) {
+    res[3].push_back(std::static_pointer_cast<primitive::LazyTensor>(
+                         std::get<3>(op_res).impl())
+                         ->value());
+  } else {
+    pir::Value saved_mean;
+    res[3].push_back(saved_mean);
+  }
+  if (std::get<4>(op_res).initialized()) {
+    res[4].push_back(std::static_pointer_cast<primitive::LazyTensor>(
+                         std::get<4>(op_res).impl())
+                         ->value());
+  } else {
+    pir::Value saved_var;
+    res[4].push_back(saved_var);
+  }
   pir::Value reserve_space;
   res[5].push_back(reserve_space);
 
@@ -180,12 +190,23 @@ std::vector<std::vector<pir::Value>> BatchNorm_Op::Decomp(pir::Operation* op) {
   res[2].push_back(std::static_pointer_cast<primitive::LazyTensor>(
                        std::get<2>(op_res).impl())
                        ->value());
-  res[3].push_back(std::static_pointer_cast<primitive::LazyTensor>(
-                       std::get<3>(op_res).impl())
-                       ->value());
-  res[4].push_back(std::static_pointer_cast<primitive::LazyTensor>(
-                       std::get<4>(op_res).impl())
-                       ->value());
+  if (std::get<3>(op_res).initialized()) {
+    res[3].push_back(std::static_pointer_cast<primitive::LazyTensor>(
+                         std::get<3>(op_res).impl())
+                         ->value());
+  } else {
+    pir::Value saved_mean;
+    res[3].push_back(saved_mean);
+  }
+  if (std::get<4>(op_res).initialized()) {
+    res[4].push_back(std::static_pointer_cast<primitive::LazyTensor>(
+                         std::get<4>(op_res).impl())
+                         ->value());
+  } else {
+    pir::Value saved_var;
+    res[4].push_back(saved_var);
+  }
+
   pir::Value reserve_space;
   res[5].push_back(reserve_space);
 
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc
index c1aa3d776b67e..7b15459837fd9 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc
@@ -61,7 +61,7 @@ std::vector<std::vector<pir::Value>> StackGradOp::DecompVjp(
     auto stop_gradients_attr = op->attribute(kAttrStopGradients)
                                    .dyn_cast<pir::ArrayAttribute>()
                                    .AsVector();
-    for (size_t i = 0; i < stop_gradients[0].size(); ++i) {
+    for (size_t i = 0; i < stop_gradients_attr.size(); ++i) {
       stop_gradients[0].push_back(
           stop_gradients_attr[i].dyn_cast<pir::BoolAttribute>().data());
     }
@@ -144,24 +144,31 @@ std::vector<std::vector<pir::Value>> ConcatGradOp::DecompVjp(
                     .dyn_cast<paddle::dialect::ScalarAttribute>()
                     .data();
 
-  VLOG(6) << "Decomp call concat_grad's backward composite rule prepare";
+  VLOG(4) << "Decomp call concat_grad's backward composite rule prepare";
 
   std::vector<std::vector<bool>> stop_gradients(op->results().size());
-  if (combine_op_obj_x->HasAttribute(kAttrStopGradients)) {
-    auto stop_gradients_attr = op->attribute(kAttrStopGradients)
-                                   .dyn_cast<pir::ArrayAttribute>()
-                                   .AsVector();
-    for (size_t i = 0; i < stop_gradients[0].size(); ++i) {
-      stop_gradients[0].push_back(
-          stop_gradients_attr[i].dyn_cast<pir::BoolAttribute>().data());
+  auto splitop = op->results()[0].first_use().owner();
+
+  if (splitop->HasAttribute("current_bwd_op_stop_gradients")) {
+    auto stop_gradients_attr =
+        splitop->attribute("current_bwd_op_stop_gradients")
+            .dyn_cast<pir::ArrayAttribute>()
+            .AsVector();
+    for (size_t i = 0; i < stop_gradients_attr.size(); ++i) {
+      auto stop_gradients_attr_j =
+          stop_gradients_attr[i].dyn_cast<pir::ArrayAttribute>().AsVector();
+      for (size_t j = 0; j < stop_gradients_attr_j.size(); ++j) {
+        stop_gradients[0].push_back(
+            stop_gradients_attr_j[j].dyn_cast<pir::BoolAttribute>().data());
+      }
     }
 
-    VLOG(4) << " stop_gradients is set ";
+    VLOG(4) << " op stop_gradients is set ";
   } else {
     std::vector<bool> x_grad_stop_gradient(combine_op_obj_x.inputs().size(),
                                            false);
     stop_gradients[0] = x_grad_stop_gradient;
-    VLOG(4) << " stop_gradients is not set ";
+    VLOG(4) << " op stop_gradients is not set ";
   }
 
   std::vector<std::vector<paddle::Tensor>> tensor_res;
@@ -179,6 +186,7 @@ std::vector<std::vector<pir::Value>> ConcatGradOp::DecompVjp(
 
   paddle::primitive::details::concat_grad<primitive::LazyTensor>(
       x, out_grad, axis, x_grad);
+  VLOG(4) << "Call Pir Decomposed backward op concat_grad end";
   std::vector<std::vector<pir::Value>> res(tensor_res.size());
 
   for (size_t i = 0; i < tensor_res.size(); ++i) {
diff --git a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc
index 8a843a8881734..4eb8190eaa111 100644
--- a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc
@@ -16,8 +16,7 @@
 #include "paddle/common/enforce.h"
 #include "paddle/common/errors.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 const phi::IntArray &IntArrayAttribute::data() const {
   return storage()->GetAsKey();
 }
@@ -130,8 +129,7 @@ DataLayoutAttribute DataLayoutAttribute::Parse(
       parser.ctx, StringToDataLayoutMap().at(datalayout_token_val));
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IntArrayAttribute)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ScalarAttribute)
diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc
index 481742a807ac6..275667c1edc27 100644
--- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc
@@ -38,8 +38,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.h"
 #endif
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 struct CombineOpInferSymbolicShapeInterfaceModel
     : public InferSymbolicShapeInterface::Concept {
@@ -1064,8 +1063,7 @@ void CustomOpDialect::RegisterCustomOp(const paddle::OpMetaInfo& op_meta) {
                                verify_func,
                                verify_func);
 }
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OperatorDialect)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomOpDialect)
diff --git a/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc
index 1d93e27c59b0b..78cb8e6460769 100644
--- a/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc
+++ b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc
@@ -21,8 +21,7 @@
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 std::shared_ptr<paddle::framework::Variable>
 ParameterConvertInterface::ParameterToVariable(pir::Parameter *parameter) {
   if (parameter->type().isa<DenseTensorType>()) {
@@ -79,7 +78,6 @@ std::unique_ptr<pir::Parameter> ParameterConvertInterface::VariableToParameter(
   }
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ParameterConvertInterface)
diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc
index 32c45d20f8d25..7ba06ac2944ad 100644
--- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc
@@ -17,8 +17,7 @@
 #include <utility>
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 OpYamlInfoParser::OpYamlInfoParser(OpInfoTuple op_info_tuple, bool is_legacy_op)
     : op_info_tuple_(std::move(op_info_tuple)), is_legacy_op_(is_legacy_op) {
@@ -239,5 +238,4 @@ int OpYamlInfoParser::GetTensorParamIndexByArgsName(
   }
 }
 
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index 4f752932c6ba6..3552cf88a0765 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -45,8 +45,6 @@ const std::unordered_set<std::string> LegacyOpList = {
     CSyncCommStream_Op::name(),
     DistributedPushSparseOp::name(),
     FtrlOp::name(),
-    FusedElemwiseAddActivationOp::name(),
-    FusedElemwiseAddActivationGradOp::name(),
     DpsgdOp::name(),
     SendV2Op::name(),
     RecvV2Op::name(),
@@ -85,7 +83,6 @@ const std::unordered_set<std::string> LegacyOpList = {
     paddle::onednn::dialect::LrnOp::name(),
     paddle::onednn::dialect::LrnGradOp::name(),
     paddle::onednn::dialect::MultiGruOp::name(),
-    paddle::onednn::dialect::FusionLstmOp::name(),
 #endif
     CReduceAvgOp::name(),
     CReduceAvg_Op::name(),
diff --git a/paddle/fluid/pir/drr/src/match_context_impl.h b/paddle/fluid/pir/drr/src/match_context_impl.h
index a9acb5f6ed8df..ce6911fb36ecb 100644
--- a/paddle/fluid/pir/drr/src/match_context_impl.h
+++ b/paddle/fluid/pir/drr/src/match_context_impl.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <unordered_map>
 
+#include "glog/logging.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/drr/include/drr_pattern_context.h"
 #include "paddle/fluid/pir/drr/src/attr_type_uilts.h"
@@ -100,27 +101,39 @@ class MatchContextImpl final {
     tensor_map_.emplace(value_name, value);
   }
 
-  void BindIrOperation(const OpCall* op_call, pir::Operation* op) {
+  // Records `op` as the IR operation matched by `op_call`, then binds every
+  // attribute declared on `op_call` to the attribute of `op` stored under the
+  // same key. Returns true when all attributes were bound. Returns false,
+  // after logging a warning, as soon as one attribute is not a
+  // NormalAttribute or is absent from `op`.
+  // Note: the op_call -> op entry is inserted before the attributes are
+  // checked and is not erased when false is returned.
+  bool BindIrOperation(const OpCall* op_call, pir::Operation* op) {
     operation_map_.emplace(op_call, op);
     const auto& attrs = op_call->attributes();
     for (const auto& kv : attrs) {
-      std::visit(
+      bool bind_success = std::visit(
           [&](auto&& arg) {
             if constexpr (std::is_same_v<std::decay_t<decltype(arg)>,
                                          NormalAttribute>) {
-              PADDLE_ENFORCE(
-                  op->HasAttribute(kv.first),
-                  phi::errors::NotFound(
-                      "Not found attribute [%s] in Op [%s], please check the "
-                      "validity of the attribute name[%s].",
-                      kv.first,
-                      op->name(),
-                      kv.first));
-              BindIrAttr(arg.name(), op->attribute(kv.first));
+              if (op->HasAttribute(kv.first)) {
+                BindIrAttr(arg.name(), op->attribute(kv.first));
+                return true;
+              }
             }
+            return false;
           },
           kv.second);
+      if (!bind_success) {
+        LOG(WARNING) << "Not found attribute [" << kv.first << "] in Op ["
+                     << op->name()
+                     << "], please check the "
+                        "validity of the attribute name["
+                     << kv.first << "].";
+        return false;
+      }
     }
+    return true;
   }
 
  private:
diff --git a/paddle/fluid/pir/drr/src/pattern_graph.cc b/paddle/fluid/pir/drr/src/pattern_graph.cc
index 632f41d2adbb9..b0c2dbcd58ae2 100644
--- a/paddle/fluid/pir/drr/src/pattern_graph.cc
+++ b/paddle/fluid/pir/drr/src/pattern_graph.cc
@@ -20,8 +20,7 @@
 #include "paddle/fluid/pir/drr/include/drr_pattern_context.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace drr {
+namespace paddle::drr {
 
 const drr::OpCall &PatternGraph::AddOpCall(
     const std::shared_ptr<drr::OpCall> &op_call) {
@@ -228,5 +227,4 @@ std::ostream &operator<<(std::ostream &os, const PatternGraph &pattern_graph) {
   return os;
 }
 
-}  // namespace drr
-}  // namespace paddle
+}  // namespace paddle::drr
diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc
index 53b7ec0c919e9..93095af050afe 100644
--- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc
+++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc
@@ -356,7 +356,10 @@ bool DrrRewritePattern::MatchFromOutputToInput(
       break;
     }
     // Step 1: Bind Operation of current op to match_ctx.
-    source_pattern_match_ctx->BindIrOperation(drr_node, ir_node);
+    if (!source_pattern_match_ctx->BindIrOperation(drr_node, ir_node)) {
+      matched = false;
+      break;
+    }
 
     // Step 2: Bind input_tensor of current op to match_ctx.
     const auto& drr_input_tensors = drr_node->inputs();
@@ -391,7 +394,7 @@ bool DrrRewritePattern::MatchFromOutputToInput(
           ir_input_values[i].use_count()) {
         matched = false;
         VLOG(8) << drr_node->name() << " Match failed: consumers of drr intput["
-                << i << "] { " << drr_node->outputs().size()
+                << i << "] { " << drr_input_tensors[i]->consumers().size()
                 << " } != consumers of pir intput[" << i << "] { "
                 << ir_input_values[i].use_count() << " }.";
         break;
@@ -495,8 +498,9 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
     }
 
     // set insert point
-    size_t max_input_op_index = 0UL;
-    pir::Operation* max_index_op = nullptr;
+    // 1. get result pattern max-idx of input op
+    size_t max_res_idx = 0UL;
+    pir::Operation* max_res_idx_op = nullptr;
     for (const Tensor* input : op_call.inputs()) {
       if (input->is_none()) {
         continue;
@@ -506,18 +510,16 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
         pir::Operation* ir_input_op = ir_val.defining_op();
         if (op_2_temp_program_index.count(ir_input_op) == 0) {
           // do nothing
-        } else if (max_input_op_index <
-                   op_2_temp_program_index.at(ir_input_op)) {
-          max_input_op_index = op_2_temp_program_index.at(ir_input_op);
-          max_index_op = ir_input_op;
-        } else if (max_input_op_index ==
-                   op_2_temp_program_index.at(ir_input_op)) {
-          const auto& ops_vec = temp_program[max_input_op_index];
+        } else if (max_res_idx < op_2_temp_program_index.at(ir_input_op)) {
+          max_res_idx = op_2_temp_program_index.at(ir_input_op);
+          max_res_idx_op = ir_input_op;
+        } else if (max_res_idx == op_2_temp_program_index.at(ir_input_op)) {
+          const auto& ops_vec = temp_program[max_res_idx];
           for (auto it = ops_vec.begin(); it != ops_vec.end(); it++) {
-            if (*it == max_index_op) {
+            if (*it == max_res_idx_op) {
               break;
             } else if (*it == ir_input_op) {
-              max_index_op = ir_input_op;
+              max_res_idx_op = ir_input_op;
               break;
             } else {
               // do nothing
@@ -528,25 +530,29 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
         }
       }
     }
-    if (max_input_op_index == 0UL) {
-      VLOG(6) << "Not found producer op for (" << op_call.name() << ")";
-      pir::Operation* source_pattern_first_op = src_match_ctx.IrOperation(
-          source_pattern_graph.owned_op_call()[0].get());
-      max_input_op_index = op_2_temp_program_index[source_pattern_first_op];
-      rewriter.set_insertion_point(source_pattern_first_op);
-    } else {
-      rewriter.SetInsertionPointAfter(max_index_op);
-    }
 
-    pir::Operation* new_op =
-        CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx);
+    // 2. get source pattern min-idx op
+    pir::Operation* min_src_idx_op = src_match_ctx.IrOperation(
+        source_pattern_graph.owned_op_call()[0].get());
+    size_t min_src_idx = op_2_temp_program_index[min_src_idx_op];
+    for (const auto& src_owned_op_call : source_pattern_graph.owned_op_call()) {
+      pir::Operation* src_owned_op =
+          src_match_ctx.IrOperation(src_owned_op_call.get());
+      size_t src_owned_op_idx = op_2_temp_program_index[src_owned_op];
+      if (min_src_idx > src_owned_op_idx) {
+        min_src_idx = src_owned_op_idx;
+        min_src_idx_op = src_owned_op;
+      }
+    }
 
-    size_t new_max_input_op_index = max_input_op_index + 1;
-    op_2_temp_program_index[new_op] = new_max_input_op_index;
-    if (new_max_input_op_index >= temp_program.size()) {
-      temp_program.emplace_back();
+    // 3. insert at max(max_res_idx+1, min_src_idx); no producer -> min_src_idx
+    if (max_res_idx_op == nullptr || min_src_idx > max_res_idx) {
+      rewriter.set_insertion_point(min_src_idx_op);
+    } else {
+      rewriter.SetInsertionPointAfter(max_res_idx_op);
     }
-    temp_program[new_max_input_op_index].push_back(new_op);
+
+    CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx);
   });
 
   return res_match_ctx;
diff --git a/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h
index d4aaefe81c983..8ad7a0e13d3f2 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/deserialize_utils.h
@@ -17,17 +17,43 @@
 #include <string>
 #include <vector>
 
-#include "glog/logging.h"
 #include "paddle/common/layout.h"
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/serialize_deserialize/include/schema.h"
-#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h"
+#include "paddle/fluid/pir/serialize_deserialize/include/third_party.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/pir/include/core/builtin_attribute.h"
 #include "paddle/pir/include/core/builtin_type.h"
 
 namespace pir {
+#define DECOMPRESS_DIALECT_ID(name) \
+  pir::DialectIdMap::Instance()->GetDecompressDialectId(name)
+
+// Rewrites *op_name in place as DECOMPRESS_DIALECT_ID(prefix) + "." + suffix.
+inline void GetDecompressOpName(std::string* op_name) {
+  std::pair<std::string, std::string> name = getContentSplitByDot(*op_name);
+  *op_name = DECOMPRESS_DIALECT_ID(name.first) + "." + name.second;
+}
+// Per-dialect JSON readers; names are passed without the dialect prefix.
+class AttrTypeReader {
+ public:
+  static pir::Attribute ReadBuiltInAttr(const std::string attr_name,
+                                        Json* attr_json,
+                                        pir::IrContext* ctx);
+
+  static pir::Type ReadBuiltInType(const std::string type_name,
+                                   Json* type_json,
+                                   pir::IrContext* ctx);
+
+  static pir::Attribute ReadPaddleOperatorAttr(const std::string attr_name,
+                                               Json* attr_json,
+                                               pir::IrContext* ctx);
+
+  static pir::Type ReadPaddleOperatorType(const std::string type_name,
+                                          Json* type_json,
+                                          pir::IrContext* ctx);
+};
 
 template <typename T>
 T deserializeTypeFromJson(Json* type_json, pir::IrContext* ctx) {
@@ -147,78 +173,26 @@ deserializeAttrFromJson<paddle::dialect::PlaceAttribute, int8_t>(
 
 pir::Type parseType(Json* type_json) {
   auto type_name = type_json->at(ID).template get<std::string>();
-  pir::IrContext* ctx = pir::IrContext::Instance();
-
-  if (type_name == pir::BoolType::name()) {
-    VLOG(8) << "Parse BoolType ... ";
-    return pir::deserializeTypeFromJson<pir::BoolType>(type_json, ctx);
-  } else if (type_name == pir::BFloat16Type::name()) {
-    VLOG(8) << "Parse BFloat16Type ... ";
-    return pir::deserializeTypeFromJson<pir::BFloat16Type>(type_json, ctx);
-  } else if (type_name == pir::Float16Type::name()) {
-    VLOG(8) << "Parse Float16Type ... ";
-    return pir::deserializeTypeFromJson<pir::Float16Type>(type_json, ctx);
-  } else if (type_name == pir::Float32Type::name()) {
-    VLOG(8) << "Parse Float32Type ... ";
-    return pir::deserializeTypeFromJson<pir::Float32Type>(type_json, ctx);
-  } else if (type_name == pir::Float64Type::name()) {
-    VLOG(8) << "Parse Float64Type ... ";
-    return pir::deserializeTypeFromJson<pir::Float64Type>(type_json, ctx);
-  } else if (type_name == pir::Int8Type::name()) {
-    VLOG(8) << "Parse Int8Type ... ";
-    return pir::deserializeTypeFromJson<pir::Int8Type>(type_json, ctx);
-  } else if (type_name == pir::UInt8Type::name()) {
-    VLOG(8) << "Parse UInt8Type ... ";
-    return pir::deserializeTypeFromJson<pir::UInt8Type>(type_json, ctx);
-  } else if (type_name == pir::Int16Type::name()) {
-    VLOG(8) << "Parse Int16Type ... ";
-    return pir::deserializeTypeFromJson<pir::Int16Type>(type_json, ctx);
-  } else if (type_name == pir::Int32Type::name()) {
-    VLOG(8) << "Parse Int32Type ... ";
-    return pir::deserializeTypeFromJson<pir::Int32Type>(type_json, ctx);
-  } else if (type_name == pir::Int64Type::name()) {
-    VLOG(8) << "Parse Int64Type ... ";
-    return pir::deserializeTypeFromJson<pir::Int64Type>(type_json, ctx);
-  } else if (type_name == pir::IndexType::name()) {
-    VLOG(8) << "Parse IndexType ... ";
-    return pir::deserializeTypeFromJson<pir::IndexType>(type_json, ctx);
-  } else if (type_name == pir::Complex64Type::name()) {
-    VLOG(8) << "Parse Complex64Type ... ";
-    return pir::deserializeTypeFromJson<pir::Complex64Type>(type_json, ctx);
-  } else if (type_name == pir::Complex128Type::name()) {
-    VLOG(8) << "Parse Complex128Type ... ";
-    return pir::deserializeTypeFromJson<pir::Complex128Type>(type_json, ctx);
-  } else if (type_name == pir::VectorType::name()) {
-    VLOG(8) << "Parse VectorType ... ";
-    std::vector<pir::Type> content;
-    for (auto& type_x : type_json->at(DATA)) {
-      content.push_back(parseType(&type_x));
-    }
-    return pir::VectorType::get(ctx, content);
-  } else if (type_name == pir::DenseTensorType::name()) {
-    VLOG(8) << "Parse DenseTensorType ... ";
-    Json data_json = type_json->at(DATA);
-    pir::Type dtype = parseType(&(data_json.at(0)));
 
-    std::vector<int64_t> dims =
-        data_json.at(1).template get<std::vector<int64_t>>();
-    phi::DDim ddim = phi::make_ddim(dims);
-    pir::DataLayout data_layout =
-        common::StringToDataLayout(data_json.at(2).template get<std::string>());
+  if (type_name == NULL_TYPE) {
+    return pir::Type();
+  }
 
-    std::vector<std::vector<size_t>> lod =
-        data_json.at(3).template get<std::vector<std::vector<size_t>>>();
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  std::pair<std::string, std::string> name = getContentSplitByDot(type_name);
 
-    size_t offset = data_json.at(4).get<size_t>();
-    return pir::DenseTensorType::get(
-        ctx, dtype, ddim, data_layout, lod, offset);
-  } else if (type_name == NULL_TYPE) {
-    return pir::Type();
+  if (DECOMPRESS_DIALECT_ID(name.first) == pir::BuiltinDialect::name()) {
+    return AttrTypeReader::ReadBuiltInType(name.second, type_json, ctx);
+  } else if (DECOMPRESS_DIALECT_ID(name.first) ==
+             paddle::dialect::OperatorDialect::name()) {
+    return AttrTypeReader::ReadPaddleOperatorType(name.second, type_json, ctx);
   } else {
-    PADDLE_ENFORCE(false,
-                   phi::errors::InvalidArgument(
-                       "Unknown Type %s for parse type", type_name));
+    PADDLE_ENFORCE(
+        false,
+        phi::errors::InvalidArgument(
+            "Unknown dialect of Type %s for parse type", type_name));
   }
+  // NOTE(review): looks unreachable - each branch above returns or raises.
   VLOG(8) << "Finish Parse Type ... ";
 
   return pir::Type();
@@ -234,7 +208,28 @@ pir::TypeAttribute deserializeAttrFromJson<pir::TypeAttribute, pir::Type>(
 pir::Attribute parseAttr(Json* attr_json) {
   std::string attr_name = attr_json->at(ID).template get<std::string>();
   pir::IrContext* ctx = pir::IrContext::Instance();
+  std::pair<std::string, std::string> name = getContentSplitByDot(attr_name);
+  // Dispatch on the dialect prefix; the reader gets the name without it.
+  if (DECOMPRESS_DIALECT_ID(name.first) == pir::BuiltinDialect::name()) {
+    return AttrTypeReader::ReadBuiltInAttr(name.second, attr_json, ctx);
+  } else if (DECOMPRESS_DIALECT_ID(name.first) ==
+             paddle::dialect::OperatorDialect::name()) {
+    return AttrTypeReader::ReadPaddleOperatorAttr(name.second, attr_json, ctx);
+  } else {
+    PADDLE_ENFORCE(
+        false,
+        phi::errors::InvalidArgument(
+            "Unknown dialect of Attr %s for parse attr", attr_name));
+  }
+
+  VLOG(8) << "Finish Parse Attr ... ";
 
+  return pir::Attribute();
+}
+
+pir::Attribute AttrTypeReader::ReadBuiltInAttr(const std::string attr_name,
+                                               Json* attr_json,
+                                               pir::IrContext* ctx) {
   if (attr_name == pir::BoolAttribute::name()) {
     VLOG(8) << "Parse BoolAttribute .";
     return pir::deserializeAttrFromJson<pir::BoolAttribute, bool>(attr_json,
@@ -286,7 +281,18 @@ pir::Attribute parseAttr(Json* attr_json) {
     VLOG(8) << "Parse StrAttribute .";
     return pir::deserializeAttrFromJson<pir::StrAttribute, std::string>(
         attr_json, ctx);
-  } else if (attr_name == paddle::dialect::IntArrayAttribute::name()) {
+  } else {
+    PADDLE_ENFORCE(
+        false,
+        phi::errors::InvalidArgument(
+            "Unknown Attr %s for parse builtin dialect attr", attr_name));
+  }
+  return pir::Attribute();
+}
+
+pir::Attribute AttrTypeReader::ReadPaddleOperatorAttr(
+    const std::string attr_name, Json* attr_json, pir::IrContext* ctx) {
+  if (attr_name == paddle::dialect::IntArrayAttribute::name()) {
     VLOG(8) << "Parse IntArrayAttribute .";
     return pir::deserializeAttrFromJson<paddle::dialect::IntArrayAttribute,
                                         std::vector<int64_t>>(attr_json, ctx);
@@ -306,11 +312,94 @@ pir::Attribute parseAttr(Json* attr_json) {
   } else {
     PADDLE_ENFORCE(false,
                    phi::errors::InvalidArgument(
-                       "Unknown Attr %s for parse attr", attr_name));
+                       "Unknown Attr %s for parse paddleoperator dialect attr",
+                       attr_name));
   }
-  VLOG(8) << "Finish Parse Attr ... ";
-
   return pir::Attribute();
 }
 
+// Deserializes a builtin-dialect type; `type_name` is the id without the
+// dialect prefix. Raises InvalidArgument for an unrecognized name.
+pir::Type AttrTypeReader::ReadBuiltInType(const std::string type_name,
+                                          Json* type_json,
+                                          pir::IrContext* ctx) {
+  if (type_name == pir::BoolType::name()) {
+    VLOG(8) << "Parse BoolType ... ";
+    return pir::deserializeTypeFromJson<pir::BoolType>(type_json, ctx);
+  } else if (type_name == pir::BFloat16Type::name()) {
+    VLOG(8) << "Parse BFloat16Type ... ";
+    return pir::deserializeTypeFromJson<pir::BFloat16Type>(type_json, ctx);
+  } else if (type_name == pir::Float16Type::name()) {
+    VLOG(8) << "Parse Float16Type ... ";
+    return pir::deserializeTypeFromJson<pir::Float16Type>(type_json, ctx);
+  } else if (type_name == pir::Float32Type::name()) {
+    VLOG(8) << "Parse Float32Type ... ";
+    return pir::deserializeTypeFromJson<pir::Float32Type>(type_json, ctx);
+  } else if (type_name == pir::Float64Type::name()) {
+    VLOG(8) << "Parse Float64Type ... ";
+    return pir::deserializeTypeFromJson<pir::Float64Type>(type_json, ctx);
+  } else if (type_name == pir::Int8Type::name()) {
+    VLOG(8) << "Parse Int8Type ... ";
+    return pir::deserializeTypeFromJson<pir::Int8Type>(type_json, ctx);
+  } else if (type_name == pir::UInt8Type::name()) {
+    VLOG(8) << "Parse UInt8Type ... ";
+    return pir::deserializeTypeFromJson<pir::UInt8Type>(type_json, ctx);
+  } else if (type_name == pir::Int16Type::name()) {
+    VLOG(8) << "Parse Int16Type ... ";
+    return pir::deserializeTypeFromJson<pir::Int16Type>(type_json, ctx);
+  } else if (type_name == pir::Int32Type::name()) {
+    VLOG(8) << "Parse Int32Type ... ";
+    return pir::deserializeTypeFromJson<pir::Int32Type>(type_json, ctx);
+  } else if (type_name == pir::Int64Type::name()) {
+    VLOG(8) << "Parse Int64Type ... ";
+    return pir::deserializeTypeFromJson<pir::Int64Type>(type_json, ctx);
+  } else if (type_name == pir::IndexType::name()) {
+    VLOG(8) << "Parse IndexType ... ";
+    return pir::deserializeTypeFromJson<pir::IndexType>(type_json, ctx);
+  } else if (type_name == pir::Complex64Type::name()) {
+    VLOG(8) << "Parse Complex64Type ... ";
+    return pir::deserializeTypeFromJson<pir::Complex64Type>(type_json, ctx);
+  } else if (type_name == pir::Complex128Type::name()) {
+    VLOG(8) << "Parse Complex128Type ... ";
+    return pir::deserializeTypeFromJson<pir::Complex128Type>(type_json, ctx);
+  } else if (type_name == pir::VectorType::name()) {
+    VLOG(8) << "Parse VectorType ... ";
+    std::vector<pir::Type> content;
+    for (auto& type_x : type_json->at(DATA)) {
+      content.push_back(parseType(&type_x));
+    }
+    return pir::VectorType::get(ctx, content);
+  } else if (type_name == pir::DenseTensorType::name()) {
+    VLOG(8) << "Parse DenseTensorType ... ";
+    // DATA is [dtype, dims, layout, lod, offset]; bind by reference, no copy.
+    Json& data_json = type_json->at(DATA);
+    pir::Type dtype = parseType(&(data_json.at(0)));
+    std::vector<int64_t> dims =
+        data_json.at(1).template get<std::vector<int64_t>>();
+    phi::DDim ddim = phi::make_ddim(dims);
+    pir::DataLayout data_layout =
+        common::StringToDataLayout(data_json.at(2).template get<std::string>());
+    std::vector<std::vector<size_t>> lod =
+        data_json.at(3).template get<std::vector<std::vector<size_t>>>();
+    size_t offset = data_json.at(4).get<size_t>();
+    return pir::DenseTensorType::get(
+        ctx, dtype, ddim, data_layout, lod, offset);
+  } else {
+    PADDLE_ENFORCE(false,
+                   phi::errors::InvalidArgument(
+                       "Unknown Type %s for parse builtintype", type_name));
+  }
+  return pir::Type();
+}
+
+// No operator-dialect type is supported yet; every call raises.
+pir::Type AttrTypeReader::ReadPaddleOperatorType(
+    const std::string type_name, Json* type_json, pir::IrContext* ctx) {
+  PADDLE_ENFORCE(
+      false,
+      phi::errors::InvalidArgument(
+          "Unknown Type %s for parse paddleoperator dialect type", type_name));
+  return pir::Type();
+}
+
 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h b/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h
index 2ae9f22d21a9c..ed8364bacc229 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/ir_deserialize.h
@@ -15,7 +15,7 @@
 
 #include <fstream>
 #include "paddle/common/enforce.h"
-#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h"
+#include "paddle/fluid/pir/serialize_deserialize/include/third_party.h"
 #include "paddle/pir/include/core/operation.h"
 #include "paddle/pir/include/core/program.h"
 
@@ -47,6 +47,8 @@ class ProgramReader {
                                       Json* operesult_attrs_json);
   pir::Attribute ReadAttribute(Json* attr_json);
   pir::Type ReadType(Json* type_json);
+
+  pir::Operation* ReadParameterOp(Json* op_json);
 };
 
 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h b/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h
index 96baf995d5aeb..ea58e51fbed0d 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/ir_serialize.h
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h"
+#include "paddle/fluid/pir/serialize_deserialize/include/third_party.h"
 #include "paddle/pir/include/core/program.h"
 
 namespace pir {
@@ -76,6 +76,9 @@ class ProgramWriter {
   Json WriteAttribute(const std::string& op_attr_name,
                       const pir::Attribute& attr);
   Json WriteType(const pir::Type& type);
+
+  // Special-cased op, written in a compact form to reduce the JSON file size
+  Json WriteParameterOP(const pir::Operation& op);
 };
 
 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/schema.h b/paddle/fluid/pir/serialize_deserialize/include/schema.h
index dcfdfc09114a5..19e40053e36af 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/schema.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/schema.h
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-
+#include "glog/logging.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
+#include "paddle/pir/include/core/builtin_dialect.h"
+#include "paddle/pir/include/dialect/control_flow/ir/cf_dialect.h"
 namespace pir {
 /**
  * IMPORTANT!!!
@@ -23,8 +26,8 @@ namespace pir {
 
 // all IR structure's identifier (region, block, op, attr, type value etc)
 // which can be string , int64_t etc.
-#define ID "id"
-
+#define ID "#"
+#define VALUE_ID "%"
 // program's key:
 #define REGIONS "regions"
 
@@ -43,9 +46,11 @@ namespace pir {
 #define BLOCKOPS "ops"
 
 // operation's key:
+// input
 // which is json array with opoperand json object(ID)
 #define OPOPERANDS "I"
 
+// output
 // which is json array with value json object(ID and TYPE_TYPE)
 #define OPRESULTS "O"
 
@@ -68,4 +73,27 @@ namespace pir {
 
 // NULL_TYPE
 #define NULL_TYPE "NULL"
+
+// compact id used for the specially-compressed ParameterOp
+
+#define PARAMETEROP "p"
+
+std::pair<std::string, std::string> getContentSplitByDot(
+    const std::string& str);
+
+class DialectIdMap {  // two-way table: dialect name <-> short serialized id
+ public:
+  static DialectIdMap* Instance();  // shared instance
+  DialectIdMap();
+  void insert(const std::string& key, const std::string& value);
+  // Maps a dialect name to the short id that is written into serialized ids.
+  std::string GetCompressDialectId(const std::string& name);
+  // Inverse lookup: maps a short id back to the dialect name.
+  std::string GetDecompressDialectId(const std::string& id);
+  // Both maps are kept in sync by insert().
+ private:
+  std::unordered_map<std::string, std::string> CompressDialect;  // name -> id
+  std::unordered_map<std::string, std::string> DecompressDialect;  // id -> name
+};
+
 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h
index a6cae97f135d9..ddda2f5863f0c 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h
@@ -18,16 +18,36 @@
 #include <string>
 #include <vector>
 
-#include "glog/logging.h"
-
+#include "paddle/common/layout.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/serialize_deserialize/include/schema.h"
-#include "paddle/fluid/pir/serialize_deserialize/include/third_part.h"
+#include "paddle/fluid/pir/serialize_deserialize/include/third_party.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/pir/include/core/builtin_attribute.h"
 #include "paddle/pir/include/core/builtin_type.h"
 
 namespace pir {
+#define COMPRESS_DIALECT_NAME(attr_template)           \
+  pir::DialectIdMap::Instance()->GetCompressDialectId( \
+      (attr_template).dialect().name())
+
+inline void GetCompressOpName(std::string* op_name) {
+  // Header-defined, hence inline. Rewrites "<dialect>.<op>" to "<id>.<op>".
+  const auto parts = getContentSplitByDot(*op_name);
+  *op_name = pir::DialectIdMap::Instance()->GetCompressDialectId(parts.first) +
+             "." + parts.second;
+}
+
+class AttrTypeWriter {  // per-dialect JSON writers used by writeAttr/writeType
+ public:
+  static Json WriteBuiltInAttr(const pir::Attribute& attr);
+  // Serializes a builtin-dialect type; fails on a type it does not handle.
+  static Json WriteBuiltInType(const pir::Type& type);
+  // Serializes a paddle operator-dialect attribute (IntArray, Place, ...).
+  static Json WritePaddleOperatorAttr(const pir::Attribute& attr);
+  // Currently rejects every type: no operator-dialect type is serialized.
+  static Json WritePaddleOperatorType(const pir::Type& type);
+};
 /** serializeTypeToJson is a template function to serialize
  * a pir type to a json object. a pir type may have value or no value
  * Value free types only have ID, while value based types have
@@ -49,7 +69,7 @@ namespace pir {
 template <typename T>
 Json serializeTypeToJson(const T& type) {
   Json json_obj;
-  json_obj[ID] = type.name();
+  json_obj[ID] = COMPRESS_DIALECT_NAME(type) + "." + type.name();
   return json_obj;
 }
 
@@ -73,18 +93,18 @@ Json serializeTypeToJson(const T& type) {
 template <typename T>
 Json serializeAttrToJson(const T& attr) {
   Json json_obj;
-  json_obj[ID] = attr.name();
+  json_obj[ID] = COMPRESS_DIALECT_NAME(attr) + "." + attr.name();
   json_obj[DATA] = attr.data();
   return json_obj;
 }
 
-#define SERIALIZE_ATTR_TO_JSON(type, data)           \
-  template <>                                        \
-  Json serializeAttrToJson<type>(const type& attr) { \
-    Json json_obj;                                   \
-    json_obj[ID] = attr.name();                      \
-    json_obj[DATA] = data;                           \
-    return json_obj;                                 \
+#define SERIALIZE_ATTR_TO_JSON(type, data)                          \
+  template <>                                                       \
+  Json serializeAttrToJson<type>(const type& attr) {                \
+    Json json_obj;                                                  \
+    json_obj[ID] = COMPRESS_DIALECT_NAME(attr) + "." + attr.name(); \
+    json_obj[DATA] = data;                                          \
+    return json_obj;                                                \
   }
 
 SERIALIZE_ATTR_TO_JSON(pir::StrAttribute, attr.AsString());
@@ -97,12 +117,13 @@ SERIALIZE_ATTR_TO_JSON(paddle::dialect::IntArrayAttribute,
                        attr.data().GetData());
 SERIALIZE_ATTR_TO_JSON(paddle::dialect::DataTypeAttribute,
                        phi::DataTypeToString(attr.data()));
-
+SERIALIZE_ATTR_TO_JSON(paddle::dialect::DataLayoutAttribute,
+                       common::DataLayoutToString(attr.data()));
 template <>
 Json serializeAttrToJson<paddle::dialect::ScalarAttribute>(
     const paddle::dialect::ScalarAttribute& attr) {
   Json json_obj;
-  json_obj[ID] = attr.name();
+  json_obj[ID] = COMPRESS_DIALECT_NAME(attr) + "." + attr.name();
 
   Json content = Json::array();
   auto scalar = attr.data();
@@ -151,7 +172,7 @@ template <>
 Json serializeAttrToJson<paddle::dialect::PlaceAttribute>(
     const paddle::dialect::PlaceAttribute& attr) {
   Json json_obj;
-  json_obj[ID] = attr.name();
+  json_obj[ID] = COMPRESS_DIALECT_NAME(attr) + "." + attr.name();
   Json content = Json::array();
   auto place = attr.data();
   content.push_back(static_cast<int8_t>(place.GetType()));
@@ -162,6 +183,112 @@ Json serializeAttrToJson<paddle::dialect::PlaceAttribute>(
 }
 
 Json writeType(const pir::Type& type) {
+  // Dispatches a type to the writer of its owning dialect.
+  Json type_json = Json::object();
+  if (!type) {
+    type_json[ID] = NULL_TYPE;
+    return type_json;
+  }
+  const std::string& dialect_name = type.dialect().name();
+  if (dialect_name == pir::BuiltinDialect::name()) {
+    VLOG(6) << "write BuiltinType ... ";
+    return AttrTypeWriter::WriteBuiltInType(type);
+  } else if (dialect_name == paddle::dialect::OperatorDialect::name()) {
+    VLOG(6) << "write PaddleOperatorType ... ";
+    return AttrTypeWriter::WritePaddleOperatorType(type);
+  }
+  // Name the offending dialect so the "%s" placeholder is actually filled.
+  PADDLE_ENFORCE(false,
+                 phi::errors::InvalidArgument(
+                     "Unknown Type %s when write type", dialect_name));
+  return type_json;
+}
+
+SERIALIZE_ATTR_TO_JSON(pir::TypeAttribute, writeType(attr.data()));
+
+Json writeAttr(const pir::Attribute& attr) {
+  // Dispatches an attribute to the writer of its owning dialect.
+  const std::string& dialect_name = attr.dialect().name();
+  if (dialect_name == pir::BuiltinDialect::name()) {
+    VLOG(8) << "write BuiltinAttr ... ";
+    return AttrTypeWriter::WriteBuiltInAttr(attr);
+  } else if (dialect_name == paddle::dialect::OperatorDialect::name()) {
+    VLOG(8) << "write PaddleOperatorAttr ... ";
+    return AttrTypeWriter::WritePaddleOperatorAttr(attr);
+  }
+  // Name the offending dialect so the "%s" placeholder is actually filled.
+  PADDLE_ENFORCE(false,
+                 phi::errors::InvalidArgument(
+                     "Unknown Attr %s when write attr", dialect_name));
+  // Only reached if the enforce above does not throw.
+  return Json::object();
+}
+
+Json AttrTypeWriter::WriteBuiltInAttr(const pir::Attribute& attr) {
+  Json attr_json = Json::object();  // populated only by the array branch
+  if (attr.isa<pir::BoolAttribute>()) {
+    VLOG(8) << "write BoolAttribute .";
+    return pir::serializeAttrToJson<pir::BoolAttribute>(
+        attr.dyn_cast<pir::BoolAttribute>());
+  } else if (attr.isa<pir::FloatAttribute>()) {
+    VLOG(8) << "write FloatAttribute .";
+    return pir::serializeAttrToJson<pir::FloatAttribute>(
+        attr.dyn_cast<pir::FloatAttribute>());
+  } else if (attr.isa<pir::DoubleAttribute>()) {
+    VLOG(8) << "write DoubleAttribute .";
+    return pir::serializeAttrToJson<pir::DoubleAttribute>(
+        attr.dyn_cast<pir::DoubleAttribute>());
+  } else if (attr.isa<pir::Int32Attribute>()) {
+    VLOG(8) << "write Int32Attribute .";
+    return pir::serializeAttrToJson<pir::Int32Attribute>(
+        attr.dyn_cast<pir::Int32Attribute>());
+  } else if (attr.isa<pir::Int64Attribute>()) {
+    VLOG(8) << "write Int64Attribute .";
+    return pir::serializeAttrToJson<pir::Int64Attribute>(
+        attr.dyn_cast<pir::Int64Attribute>());
+  } else if (attr.isa<pir::IndexAttribute>()) {
+    VLOG(8) << "write IndexAttribute .";
+    return pir::serializeAttrToJson<pir::IndexAttribute>(
+        attr.dyn_cast<pir::IndexAttribute>());
+  } else if (attr.isa<pir::ArrayAttribute>()) {
+    VLOG(8) << "write ArrayAttribute .";
+    auto attr_ = attr.dyn_cast<pir::ArrayAttribute>();
+    Json val = Json::array();
+    for (size_t i = 0; i < attr_.size(); i++) {
+      val.push_back(writeAttr(attr_.at(i)));
+    }
+    attr_json[ID] = COMPRESS_DIALECT_NAME(attr_) + "." + attr_.name();
+    attr_json[DATA] = val;
+    return attr_json;
+  } else if (attr.isa<pir::TypeAttribute>()) {
+    VLOG(8) << "write TypeAttribute .";
+    return pir::serializeAttrToJson<pir::TypeAttribute>(
+        attr.dyn_cast<pir::TypeAttribute>());
+  } else if (attr.isa<pir::TensorNameAttribute>()) {
+    VLOG(8) << "write TensorNameAttribute .";
+    return pir::serializeAttrToJson<pir::TensorNameAttribute>(
+        attr.dyn_cast<pir::TensorNameAttribute>());
+  } else if (attr.isa<pir::Complex64Attribute>()) {
+    VLOG(8) << "write Complex64Attribute .";
+    return pir::serializeAttrToJson<pir::Complex64Attribute>(
+        attr.dyn_cast<pir::Complex64Attribute>());
+  } else if (attr.isa<pir::Complex128Attribute>()) {
+    VLOG(8) << "write Complex128Attribute .";
+    return pir::serializeAttrToJson<pir::Complex128Attribute>(
+        attr.dyn_cast<pir::Complex128Attribute>());
+  } else if (attr.isa<pir::StrAttribute>()) {
+    VLOG(8) << "write StrAttribute .";
+    return pir::serializeAttrToJson<pir::StrAttribute>(
+        attr.dyn_cast<pir::StrAttribute>());
+  }
+  // No builtin attribute kind matched; the message has no format placeholder.
+  PADDLE_ENFORCE(false,
+                 phi::errors::InvalidArgument(
+                     "Unknown Attr when write Builtin dialect attr"));
+  return attr_json;
+}
+
+Json AttrTypeWriter::WriteBuiltInType(const pir::Type& type) {
   Json type_json = Json::object();
   if (type.isa<pir::BoolType>()) {
     VLOG(8) << "Write BoolType ... ";
@@ -220,7 +347,7 @@ Json writeType(const pir::Type& type) {
   } else if (type.isa<pir::VectorType>()) {
     VLOG(8) << "Write VectorType ... ";
     auto type_ = type.dyn_cast<pir::VectorType>();
-    type_json[ID] = type_.name();
+    type_json[ID] = COMPRESS_DIALECT_NAME(type_) + "." + type_.name();
     Json content = Json::array();
     for (auto type_x : type_.data()) {
       content.push_back(writeType(type_x));
@@ -231,7 +358,7 @@ Json writeType(const pir::Type& type) {
     VLOG(8) << "Write DenseTensorType ... ";
     auto type_ = type.dyn_cast<pir::DenseTensorType>();
 
-    type_json[ID] = type_.name();
+    type_json[ID] = COMPRESS_DIALECT_NAME(type_) + "." + type_.name();
     Json content = Json::array();
     content.push_back(writeType(type_.dtype()));
 
@@ -248,77 +375,16 @@ Json writeType(const pir::Type& type) {
     content.push_back(type_.offset());
     type_json[DATA] = content;
     return type_json;
-  } else if (!type) {
-    type_json[ID] = NULL_TYPE;
-    return type_json;
   } else {
-    PADDLE_ENFORCE(
-        false, phi::errors::InvalidArgument("Unknown Type when write type"));
+    PADDLE_ENFORCE(false,
+                   phi::errors::InvalidArgument(
+                       "Unknown Type when write builtin dialect type"));
   }
-  VLOG(8) << "Finish write Type ... ";
-
   return type_json;
 }
 
-SERIALIZE_ATTR_TO_JSON(pir::TypeAttribute, writeType(attr.data()));
-
-Json writeAttr(const pir::Attribute& attr) {
-  Json attr_json = Json::object();
-  if (attr.isa<pir::BoolAttribute>()) {
-    VLOG(8) << "write BoolAttribute .";
-    return pir::serializeAttrToJson<pir::BoolAttribute>(
-        attr.dyn_cast<pir::BoolAttribute>());
-  } else if (attr.isa<pir::FloatAttribute>()) {
-    VLOG(8) << "write FloatAttribute .";
-    return pir::serializeAttrToJson<pir::FloatAttribute>(
-        attr.dyn_cast<pir::FloatAttribute>());
-  } else if (attr.isa<pir::DoubleAttribute>()) {
-    VLOG(8) << "write DoubleAttribute .";
-    return pir::serializeAttrToJson<pir::DoubleAttribute>(
-        attr.dyn_cast<pir::DoubleAttribute>());
-  } else if (attr.isa<pir::Int32Attribute>()) {
-    VLOG(8) << "write Int32Attribute .";
-    return pir::serializeAttrToJson<pir::Int32Attribute>(
-        attr.dyn_cast<pir::Int32Attribute>());
-  } else if (attr.isa<pir::Int64Attribute>()) {
-    VLOG(8) << "write Int64Attribute .";
-    return pir::serializeAttrToJson<pir::Int64Attribute>(
-        attr.dyn_cast<pir::Int64Attribute>());
-  } else if (attr.isa<pir::IndexAttribute>()) {
-    VLOG(8) << "write IndexAttribute .";
-    return pir::serializeAttrToJson<pir::IndexAttribute>(
-        attr.dyn_cast<pir::IndexAttribute>());
-  } else if (attr.isa<pir::ArrayAttribute>()) {
-    VLOG(8) << "write ArrayAttribute .";
-    auto attr_ = attr.dyn_cast<pir::ArrayAttribute>();
-    Json val = Json::array();
-    for (size_t i = 0; i < attr_.size(); i++) {
-      val.push_back(writeAttr(attr_.at(i)));
-    }
-    attr_json[ID] = attr_.name();
-    attr_json[DATA] = val;
-    return attr_json;
-  } else if (attr.isa<pir::TypeAttribute>()) {
-    VLOG(8) << "write TypeAttribute .";
-    return pir::serializeAttrToJson<pir::TypeAttribute>(
-        attr.dyn_cast<pir::TypeAttribute>());
-  } else if (attr.isa<pir::TensorNameAttribute>()) {
-    VLOG(8) << "write TensorNameAttribute .";
-    return pir::serializeAttrToJson<pir::TensorNameAttribute>(
-        attr.dyn_cast<pir::TensorNameAttribute>());
-  } else if (attr.isa<pir::Complex64Attribute>()) {
-    VLOG(8) << "write Complex64Attribute .";
-    return pir::serializeAttrToJson<pir::Complex64Attribute>(
-        attr.dyn_cast<pir::Complex64Attribute>());
-  } else if (attr.isa<pir::Complex128Attribute>()) {
-    VLOG(8) << "write Complex128Attribute .";
-    return pir::serializeAttrToJson<pir::Complex128Attribute>(
-        attr.dyn_cast<pir::Complex128Attribute>());
-  } else if (attr.isa<pir::StrAttribute>()) {
-    VLOG(8) << "write StrAttribute .";
-    return pir::serializeAttrToJson<pir::StrAttribute>(
-        attr.dyn_cast<pir::StrAttribute>());
-  } else if (attr.isa<paddle::dialect::IntArrayAttribute>()) {
+Json AttrTypeWriter::WritePaddleOperatorAttr(const pir::Attribute& attr) {
+  if (attr.isa<paddle::dialect::IntArrayAttribute>()) {
     VLOG(8) << "write IntArrayAttribute .";
     return pir::serializeAttrToJson<paddle::dialect::IntArrayAttribute>(
         attr.dyn_cast<paddle::dialect::IntArrayAttribute>());
@@ -334,13 +400,25 @@ Json writeAttr(const pir::Attribute& attr) {
     VLOG(8) << "write PlaceAttribute .";
     return pir::serializeAttrToJson<paddle::dialect::PlaceAttribute>(
         attr.dyn_cast<paddle::dialect::PlaceAttribute>());
+  } else if (attr.isa<paddle::dialect::DataLayoutAttribute>()) {
+    VLOG(8) << "write DataLayoutAttribute .";
+    return pir::serializeAttrToJson<paddle::dialect::DataLayoutAttribute>(
+        attr.dyn_cast<paddle::dialect::DataLayoutAttribute>());
   } else {
     PADDLE_ENFORCE(
-        false, phi::errors::InvalidArgument("Unknown Attr %s when write attr"));
+        false,
+        phi::errors::InvalidArgument(
+            "Unknown Attr %s when write paddle.operatordialect attr"));
   }
-  VLOG(8) << "Finish write& attr ... ";
+  return Json::object();
+}
 
-  return attr_json;
+Json AttrTypeWriter::WritePaddleOperatorType(const pir::Type& type) {
+  PADDLE_ENFORCE(false,  // unconditional: no operator-dialect type is written
+                 phi::errors::InvalidArgument(
+                     "Unknown Type when write paddle.operatordialect type"));
+  // Empty object; only reached if the enforce above does not throw.
+  return Json::object();
 }
 
 }  // namespace pir
diff --git a/paddle/fluid/pir/serialize_deserialize/include/third_part.h b/paddle/fluid/pir/serialize_deserialize/include/third_party.h
similarity index 100%
rename from paddle/fluid/pir/serialize_deserialize/include/third_part.h
rename to paddle/fluid/pir/serialize_deserialize/include/third_party.h
diff --git a/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc b/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc
index 44c68051bf908..9b94498a13f79 100644
--- a/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc
+++ b/paddle/fluid/pir/serialize_deserialize/src/ir_deserialize.cc
@@ -88,15 +88,72 @@ void ProgramReader::ReadBlock(Json* block_json, pir::Block* block) {
   VLOG(4) << "Finish Read " << block_name << ".";
   return;
 }
+pir::ArrayAttribute GetOneBoolArrayAttribute(pir::IrContext* ctx,
+                                             Json* attr_json) {
+  // The flag is stored as an integer; any non-zero value decodes to true.
+  const bool flag = attr_json->template get<int32_t>() != 0;
+  std::vector<pir::Attribute> elems(1, pir::BoolAttribute::get(ctx, flag));
+  return pir::ArrayAttribute::get(ctx, elems);
+}
+
+pir::Operation* ProgramReader::ReadParameterOp(Json* op_json) {
+  // Rebuilds a ParameterOp from its compact form: ATTRS is a positional array
+  // [is_distributed, is_parameter, need_clip, parameter_name] and the optional
+  // OPRESULTS_ATTRS array is [persistable, stop_gradient, trainable].
+  std::vector<pir::Value> inputs;  // stays empty: no operands are read here
+  Json& opresult_json = op_json->at(OPRESULTS);
+  std::vector<pir::Type> output_types;
+
+  int64_t value_id_ = opresult_json.at(VALUE_ID).template get<int64_t>();
+  output_types.push_back(ReadType(&(opresult_json.at(TYPE_TYPE))));
+  VLOG(6) << "Finish Read value " << value_id_ << ".";
+
+  Json& attrs_json = op_json->at(ATTRS);
+  pir::AttributeMap attributes;
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  attributes.insert(
+      {"is_distributed", GetOneBoolArrayAttribute(ctx, &attrs_json.at(0))});
+  attributes.insert(
+      {"is_parameter", GetOneBoolArrayAttribute(ctx, &attrs_json.at(1))});
+  attributes.insert(
+      {"need_clip", GetOneBoolArrayAttribute(ctx, &attrs_json.at(2))});
+  attributes.insert({"parameter_name",
+                     pir::StrAttribute::get(
+                         ctx, attrs_json.at(3).template get<std::string>())});
+
+  if (op_json->contains(OPRESULTS_ATTRS)) {
+    Json& other_attrs_json = op_json->at(OPRESULTS_ATTRS);
+    attributes.insert({"persistable",
+                       GetOneBoolArrayAttribute(ctx, &other_attrs_json.at(0))});
+    attributes.insert({"stop_gradient",
+                       GetOneBoolArrayAttribute(ctx, &other_attrs_json.at(1))});
+    attributes.insert(
+        {"trainable", GetOneBoolArrayAttribute(ctx, &other_attrs_json.at(2))});
+  }
+
+  // prepare opinfo; the context obtained above is reused for the lookup
+  pir::OpInfo op_info = ctx->GetRegisteredOpInfo(pir::ParameterOp::name());
+  // deserialize op
+  pir::Operation* op =
+      Operation::Create(inputs, attributes, output_types, op_info);
+
+  // Register the single result so later ops can resolve it by value id.
+  id_value_map[value_id_] = op->result(0);
+  VLOG(4) << "Finish Read Operation " << op->name() << ".";
+  return op;
+}
 
 pir::Operation* ProgramReader::ReadOp(Json* op_json) {
   auto op_name = op_json->at(ID).template get<std::string>();
+  if (op_name == PARAMETEROP) {
+    return ReadParameterOp(op_json);
+  }
+  GetDecompressOpName(&op_name);
   VLOG(4) << "Read op_name = " << op_name << ".";
   // deserialize opoperands (find value)
   Json& operands_json = op_json->at(OPOPERANDS);
   std::vector<pir::Value> inputs;
   for (auto& operand_json : operands_json) {
-    int64_t id = operand_json.at(ID).template get<int64_t>();
+    int64_t id = operand_json.at(VALUE_ID).template get<int64_t>();
     inputs.push_back(id_value_map[id]);
   }
   VLOG(6) << "Finish Read OP's OpOperand.";
@@ -105,7 +162,7 @@ pir::Operation* ProgramReader::ReadOp(Json* op_json) {
   std::vector<pir::Type> output_types;
   std::vector<int64_t> output_ids;
   for (auto& opresult_json : opresults_json) {
-    int64_t value_id_ = opresult_json.at(ID).template get<int64_t>();
+    int64_t value_id_ = opresult_json.at(VALUE_ID).template get<int64_t>();
     output_ids.push_back(value_id_);
     output_types.push_back(ReadType(&(opresult_json.at(TYPE_TYPE))));
     VLOG(6) << "Finish Read value " << value_id_ << ".";
diff --git a/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc b/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc
index 35219cc2a7d77..037974efaeea3 100644
--- a/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc
+++ b/paddle/fluid/pir/serialize_deserialize/src/ir_serialize.cc
@@ -135,11 +135,11 @@ Json ProgramWriter::WriteValue(const pir::Value& value) {
   Json var_json;
   if (value) {
     value_id_map[value] = value_id_;
-    var_json[ID] = value_id_;
+    var_json[VALUE_ID] = value_id_;
     VLOG(6) << "Finish write value " << value_id_ << ".";
     value_id_++;
   } else {
-    var_json[ID] = 0;  // NULL_TYPE
+    var_json[VALUE_ID] = 0;  // NULL_TYPE
     VLOG(6) << "Finish write NULL_TYPE value.";
   }
 
@@ -149,9 +149,58 @@ Json ProgramWriter::WriteValue(const pir::Value& value) {
   return var_json;
 }
 
+#define ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE(attr_name)   \
+  static_cast<int32_t>(op.attributes()                      \
+                           .at(attr_name)                   \
+                           .dyn_cast<pir::ArrayAttribute>() \
+                           .at(0)                           \
+                           .dyn_cast<pir::BoolAttribute>()  \
+                           .data())
+Json ProgramWriter::WriteParameterOP(const pir::Operation& op) {
+  // Compact encoding of a ParameterOp. Bool attributes are one-element
+  // array(bool) values and are written as 0/1 integers:
+  //   ATTRS           = [is_distributed, is_parameter, need_clip,
+  //                      parameter_name (string)]
+  //   OPRESULTS_ATTRS = [persistable, stop_gradient, trainable]
+  //                     (written only when trainable_ is set)
+  Json op_json = Json::object();
+  op_json[ID] = PARAMETEROP;
+  // no operands are written for this op; only its first result is serialized
+  VLOG(4) << "Begin write Operation " << op.name() << ".";
+  op_json[OPRESULTS] = WriteValue(op.result(0));
+  Json attrs_json = Json::array();
+  attrs_json.emplace_back(
+      ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE("is_distributed"));
+  attrs_json.emplace_back(
+      ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE("is_parameter"));
+  attrs_json.emplace_back(ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE("need_clip"));
+  attrs_json.emplace_back(op.attributes()
+                              .at("parameter_name")
+                              .dyn_cast<pir::StrAttribute>()
+                              .AsString());
+  op_json[ATTRS] = attrs_json;
+  if (trainable_) {
+    // Looked up only when they are actually written, so a non-trainable save
+    // does not depend on these attributes being present on the op.
+    Json other_attrs_json = Json::array();
+    other_attrs_json.emplace_back(
+        ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE("persistable"));
+    other_attrs_json.emplace_back(
+        ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE("stop_gradient"));
+    other_attrs_json.emplace_back(
+        ONE_BOOL_ARRAY_ATTRIBUTE_CAST_TEMPLATE("trainable"));
+    op_json[OPRESULTS_ATTRS] = other_attrs_json;
+  }
+  return op_json;
+}
 Json ProgramWriter::WriteOp(const pir::Operation& op) {
+  if (op.isa<pir::ParameterOp>()) {
+    return WriteParameterOP(op);
+  }
   Json op_json = Json::object();
-  op_json[ID] = op.name();
+  auto op_name = op.name();
+  GetCompressOpName(&op_name);
+  op_json[ID] = op_name;
   // serialize opoperands
   VLOG(4) << "Begin write Operation " << op.name() << ".";
   Json operands_json = Json::array();
@@ -195,10 +244,10 @@ Json ProgramWriter::WriteOpOperand(const pir::OpOperand& op_operand) {
   Json operand_json = Json::object();
   if (op_operand.source()) {
     int64_t id = value_id_map[op_operand.source()];
-    operand_json[ID] = id;
+    operand_json[VALUE_ID] = id;
     VLOG(6) << "Finish write OpOperand " << id << ".";
   } else {
-    operand_json[ID] = 0;  // NULL_VALUE
+    operand_json[VALUE_ID] = 0;  // NULL_VALUE
     VLOG(6) << "Finish write NULL_VALUE OpOperand.";
   }
 
diff --git a/paddle/fluid/pir/serialize_deserialize/src/schema.cc b/paddle/fluid/pir/serialize_deserialize/src/schema.cc
new file mode 100644
index 0000000000000..770260d025f91
--- /dev/null
+++ b/paddle/fluid/pir/serialize_deserialize/src/schema.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/serialize_deserialize/include/schema.h"
+#include "paddle/phi/core/enforce.h"
+namespace pir {
+
+std::pair<std::string, std::string> getContentSplitByDot(
+    const std::string& str) {
+  // Split at the first '.'; without one, both halves are the whole input.
+  const size_t dot = str.find('.');
+  return dot == std::string::npos
+             ? std::make_pair(str, str)
+             : std::make_pair(str.substr(0, dot), str.substr(dot + 1));
+}
+
+DialectIdMap* DialectIdMap::Instance() {
+  static DialectIdMap map;  // function-local static shared by every caller
+  return &map;
+}
+DialectIdMap::DialectIdMap() {  // registers the built-in dialect <-> id pairs
+  insert(pir::BuiltinDialect::name(), "0");  // ids end up in saved files
+  insert(paddle::dialect::OperatorDialect::name(), "1");
+  insert(pir::ControlFlowDialect::name(), "2");
+  insert(paddle::dialect::CustomOpDialect::name(), "3");
+}
+void DialectIdMap::insert(const std::string& key, const std::string& value) {
+  CompressDialect[key] = value;    // dialect name -> short id
+  DecompressDialect[value] = key;  // short id -> dialect name
+}
+
+std::string DialectIdMap::GetCompressDialectId(const std::string& name) {
+  // Single lookup; an unregistered dialect falls back to its full name.
+  const auto it = CompressDialect.find(name);
+  if (it != CompressDialect.end()) {
+    return it->second;
+  }
+  VLOG(0) << "can't find dialect " << name
+          << "'s compress id, return original dialectname, it's better to "
+             "insert compress id in DialectIdMap() func";
+  return name;
+}
+
+std::string DialectIdMap::GetDecompressDialectId(const std::string& id) {
+  // Maps a short id back to its dialect name; an unregistered id is an error.
+  const auto it = DecompressDialect.find(id);
+  if (it != DecompressDialect.end()) {
+    return it->second;
+  }
+  PADDLE_ENFORCE(
+      false,
+      phi::errors::InvalidArgument(
+          "Unknown id %s for decompress dialect, please check your file", id));
+  return "";
+}
+
+}  // namespace pir
diff --git a/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.cc b/paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.cc
similarity index 51%
rename from paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.cc
rename to paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.cc
index 0b93a496d4dde..2e46903b1734f 100644
--- a/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h"
+#include "paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.h"
 
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/drr/include/drr_pattern_base.h"
@@ -37,11 +37,19 @@ For example:
                       |
                     output
 ------------------------------------------------------
-After the pass is applied:
+After the pass is applied (XPU):
                       X
               Scale   |   Bias
                    \  |  /
-                group_norm_silu
+                group_norm_silu_xpu
+                      |
+                     Out
+------------------------------------------------------
+After the pass is applied (GPU):
+                      X
+              Scale   |   Bias
+                   \  |  /
+                add_group_norm_silu
                       |
                      Out
 */
@@ -54,31 +62,62 @@ class GroupNormSiluPattern : public paddle::drr::DrrPatternBase {
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
     paddle::drr::SourcePattern pat = ctx->SourcePattern();
-    const auto &groupnorm = pat.Op(
-        paddle::dialect::GroupNormOp::name(),
-        {{"epsilon", pat.Attr("epsilon")}, {"groups", pat.Attr("groups")}});
+    const auto &group_norm = pat.Op(paddle::dialect::GroupNormOp::name(),
+                                    {{"epsilon", pat.Attr("epsilon")},
+                                     {"groups", pat.Attr("groups")},
+                                     {"data_format", pat.Attr("data_format")}});
 
     const auto &silu = pat.Op(paddle::dialect::SiluOp::name());
 
-    groupnorm({&pat.Tensor("X"), &pat.Tensor("Scale"), &pat.Tensor("Bias")},
-              {&pat.Tensor("Y"), &pat.Tensor("Mean"), &pat.Tensor("Variance")});
+    group_norm(
+        {&pat.Tensor("X"), &pat.Tensor("Scale"), &pat.Tensor("Bias")},
+        {&pat.Tensor("Y"), &pat.Tensor("Mean"), &pat.Tensor("Variance")});
     silu({&pat.Tensor("Y")}, {&pat.Tensor("Out")});
 
+#ifdef PADDLE_WITH_CUDA
+    pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) {
+      auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("X"));
+      if (!x_dtype.isa<pir::Float16Type>() &&
+          !x_dtype.isa<pir::BFloat16Type>()) {
+        return false;
+      }
+      return true;
+    });
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+    const auto &add_group_norm_silu_op =
+        res.Op(paddle::dialect::AddGroupNormSiluOp::name(),
+               {{"epsilon", pat.Attr("epsilon")},
+                {"groups", pat.Attr("groups")},
+                {"data_format", pat.Attr("data_format")},
+                {"activation", res.StrAttr("silu")}});
+    add_group_norm_silu_op({&res.Tensor("X"),
+                            &res.InputNoneTensor(),
+                            &res.Tensor("Scale"),
+                            &res.Tensor("Bias")},
+                           {&res.Tensor("Out"),
+                            &res.OutputNoneTensor(),
+                            &res.Tensor("Mean"),
+                            &res.Tensor("Variance")});
+#endif
+#ifdef PADDLE_WITH_XPU
     paddle::drr::ResultPattern res = pat.ResultPattern();
-
     const auto &group_norm_silu_xpu = res.Op(
         paddle::dialect::GroupNormSiluXpuOp::name(),
         {{{"epsilon", pat.Attr("epsilon")}, {"groups", pat.Attr("groups")}}});
     group_norm_silu_xpu(
         {&res.Tensor("X"), &res.Tensor("Scale"), &res.Tensor("Bias")},
         {&res.Tensor("Out")});
+#endif
   }
 };
 
-class GroupNormSiluXpuFusePass : public pir::PatternRewritePass {
+class GroupNormSiluFusePass : public pir::PatternRewritePass {
  public:
-  GroupNormSiluXpuFusePass()
-      : pir::PatternRewritePass("group_norm_silu_xpu_fuse_pass", 2) {}
+  GroupNormSiluFusePass()
+      : pir::PatternRewritePass("group_norm_silu_fuse_pass", 2) {}
 
   pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
     pir::RewritePatternSet ps(context);
@@ -90,10 +129,10 @@ class GroupNormSiluXpuFusePass : public pir::PatternRewritePass {
 }  // namespace
 
 namespace pir {
-std::unique_ptr<Pass> CreateGroupNormSiluXpuFusePass() {
-  return std::make_unique<GroupNormSiluXpuFusePass>();
+std::unique_ptr<Pass> CreateGroupNormSiluFusePass() {
+  return std::make_unique<GroupNormSiluFusePass>();
 }
 
 }  // namespace pir
 
-REGISTER_IR_PASS(group_norm_silu_xpu_fuse_pass, GroupNormSiluXpuFusePass);
+REGISTER_IR_PASS(group_norm_silu_fuse_pass, GroupNormSiluFusePass);
diff --git a/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h b/paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.h
similarity index 92%
rename from paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h
rename to paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.h
index 665c7dcb03f16..475444a19c424 100644
--- a/paddle/fluid/pir/transforms/xpu/group_norm_silu_fuse_pass.h
+++ b/paddle/fluid/pir/transforms/general/group_norm_silu_fuse_pass.h
@@ -21,6 +21,6 @@ namespace pir {
 
 class Pass;
 
-IR_API std::unique_ptr<Pass> CreateGroupNormSiluXpuFusePass();
+IR_API std::unique_ptr<Pass> CreateGroupNormSiluFusePass();
 
 }  // namespace pir
diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
index fcbfcbb910e1e..61113f8e9dfc5 100644
--- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
+++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
@@ -278,7 +278,7 @@ struct FlowGraph {
       }
     }
 
-    std::unordered_set<Node> nhwc_nodes;
+    std::unordered_set<Node> mutable_nodes;
     for (auto& op : *(program.block())) {
       auto layout_transform_iface =
           op.dyn_cast<paddle::dialect::LayoutTransformationInterface>();
@@ -286,10 +286,14 @@ struct FlowGraph {
         continue;
       }
 
+      if (!layout_transform_iface.CanBeModified(&op)) {
+        continue;
+      }
+
       auto prefer_layout = layout_transform_iface.PreferLayout(&op);
       if (prefer_layout == common::DataLayout::NHWC) {
         Node op_node(&op);
-        nhwc_nodes.insert(op_node);
+        mutable_nodes.insert(op_node);
         AddEdge(op_node, dst_node(), INF);
         VLOG(10) << "[PreProcess] node: " << op_node
                  << " should be set to NHWC";
@@ -302,7 +306,7 @@ struct FlowGraph {
     // operation who have a dertermined layout and spread its layout to
     // its output and inputs recursively.
     std::queue<Node> q;
-    for (auto& n : nhwc_nodes) {
+    for (auto& n : mutable_nodes) {
       q.push(n);
     }
     std::unordered_set<Node> is_node_layout_visited;
@@ -362,13 +366,14 @@ struct FlowGraph {
                   // a point of cut edge. So we set its outputs and inputs to
                   // immutable.
                   Node in_node = Node(v.defining_op());
-                  nhwc_nodes.erase(in_node);
-                  VLOG(10) << "erase node: " << in_node << " from nhwc set";
+                  mutable_nodes.erase(in_node);
+                  VLOG(10) << "erase node: " << in_node << " from mutable set";
 
                   for (auto it = v.use_begin(); it != v.use_end(); ++it) {
                     Node out_node(it->owner());
-                    nhwc_nodes.erase(out_node);
-                    VLOG(10) << "erase node: " << out_node << " from nhwc set";
+                    mutable_nodes.erase(out_node);
+                    VLOG(10)
+                        << "erase node: " << out_node << " from mutable set";
                   }
                 }
                 return !can_be_transformed;
@@ -380,8 +385,8 @@ struct FlowGraph {
         continue;
       }
 
-      VLOG(10) << "add node to nhwc set: " << node;
-      nhwc_nodes.insert(node);
+      VLOG(10) << "add node to mutable set: " << node;
+      mutable_nodes.insert(node);
 
       VLOG(10) << "processing node successor: " << node;
 
@@ -403,7 +408,7 @@ struct FlowGraph {
         continue;
       }
       is_node_layout_visited.insert(node);
-      if (nhwc_nodes.count(node) == 0) {
+      if (mutable_nodes.count(node) == 0) {
         VLOG(10) << "add node to nchw set: " << node;
         AddEdge(src_node(), node, INF);
       }
@@ -542,7 +547,7 @@ using Edge = FlowGraph::Edge;
 
 class TransferLayoutPass : public pir::Pass {
  public:
-  TransferLayoutPass() : pir::Pass("transfer_layout_pass", 3) {}
+  TransferLayoutPass() : pir::Pass("transfer_layout_pass", 2) {}
 
   bool CanApplyOn(pir::Operation* op) const override {
     if (!op->isa<pir::ModuleOp>()) {
diff --git a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
index 35afabe3ad1dc..bea1e68e9c077 100644
--- a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
@@ -141,11 +141,13 @@ class RmsNormFusePattern : public paddle::drr::DrrPatternBase {
 class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
  private:
   const bool extra_add_;
+  const bool trans_extra_add_;
 
  public:
-  explicit AddRmsNormFusePattern(bool extra_add) : extra_add_(extra_add) {}
+  AddRmsNormFusePattern(bool extra_add, bool trans_extra_add)
+      : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {}
 
-  uint32_t benefit() const override { return extra_add_ ? 2 : 1; }
+  uint32_t benefit() const override { return extra_add_ ? 4 : 3; }
 
   std::string name() const override { return "AddRmsNormFusePattern"; }
 
@@ -176,7 +178,9 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
     if (extra_add_) {
       const auto &add1 = pat.Op(paddle::dialect::AddOp::name());
       pat.Tensor("add_out1") =
-          add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
+          trans_extra_add_
+              ? add1(pat.Tensor("any_tensor"), pat.Tensor("add_out"))
+              : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
     }
     paddle::drr::ResultPattern res = pat.ResultPattern();
     const auto &res_rms_norm =
@@ -207,11 +211,13 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
 class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
  private:
   const bool extra_add_;
+  const bool trans_extra_add_;
 
  public:
-  explicit AddLayerNormFusePattern(bool extra_add) : extra_add_(extra_add) {}
+  AddLayerNormFusePattern(bool extra_add, bool trans_extra_add)
+      : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {}
 
-  uint32_t benefit() const override { return extra_add_ ? 2 : 1; }
+  uint32_t benefit() const override { return extra_add_ ? 4 : 3; }
   std::string name() const override { return "AddLayerNormFusePattern"; }
 
   void operator()(paddle::drr::DrrPatternContext *ctx) const override {
@@ -231,22 +237,20 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
     if (extra_add_) {
       const auto &add1 = pat.Op(paddle::dialect::AddOp::name());
       pat.Tensor("add_out1") =
-          add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
+          trans_extra_add_
+              ? add1(pat.Tensor("any_tensor"), pat.Tensor("add_out"))
+              : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
     }
 
     paddle::drr::ResultPattern res = pat.ResultPattern();
     const auto &cast_op_dtype = res.ComputeAttr(
         [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType {
-          auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x"));
-          return paddle::dialect::TransToPhiDataType(x_dtype);
+          return phi::DataType::FLOAT32;
         });
-    const auto &cast_op_1 =
+    const auto cast_1_op =
         res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
-    res.Tensor("casted_bias") = cast_op_1(res.Tensor("bias"));
-    const auto &cast_op_2 =
+    const auto cast_2_op =
         res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}});
-    res.Tensor("casted_w") = cast_op_2(res.Tensor("w"));
-
     const auto &fuse_layer_norm =
         res.Op(paddle::dialect::FusedBiasResidualLayernormOp::name(),
                {{"epsilon", pat.Attr("epsilon")},
@@ -256,14 +260,15 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
                 {"quant_round_type", res.Int32Attr(0)},
                 {"quant_max_bound", res.Float32Attr(0.0)},
                 {"quant_min_bound", res.Float32Attr(0.0)}});
-
+    res.Tensor("w_cast") = cast_1_op(res.Tensor("w"));
+    res.Tensor("bias_cast") = cast_2_op(res.Tensor("bias"));
     fuse_layer_norm(
         {
             &res.Tensor("x"),
-            &res.Tensor("casted_bias"),
-            &res.Tensor("residual"),
-            &res.Tensor("casted_w"),
             &res.InputNoneTensor(),
+            &res.Tensor("residual"),
+            &res.Tensor("w_cast"),
+            &res.Tensor("bias_cast"),
         },
         {&res.Tensor("layer_norm_out"),
          &res.Tensor("add_out"),
@@ -272,6 +277,120 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase {
   }
 };
 
+// Rewrites add(x, residual) -> group_norm into a single AddGroupNormSiluOp
+// with an empty activation; only matches when x is float16 or bfloat16.
+class AddGroupNormFusePattern : public paddle::drr::DrrPatternBase {
+ private:
+  const bool extra_add_;        // also match a second add consuming add_out
+  const bool trans_extra_add_;  // add_out is the 2nd operand of that add
+
+ public:
+  AddGroupNormFusePattern(bool extra_add, bool trans_extra_add)
+      : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {}
+
+  uint32_t benefit() const override { return extra_add_ ? 4 : 3; }
+  std::string name() const override { return "AddGroupNormFusePattern"; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern pat = ctx->SourcePattern();
+    const auto &add = pat.Op(paddle::dialect::AddOp::name());
+    const auto &group_norm = pat.Op(paddle::dialect::GroupNormOp::name(),
+                                    {{"epsilon", pat.Attr("epsilon")},
+                                     {"groups", pat.Attr("groups")},
+                                     {"data_format", pat.Attr("data_format")}});
+    pat.Tensor("add_out") = add(pat.Tensor("x"), pat.Tensor("residual"));
+    group_norm(
+        {&pat.Tensor("add_out"), &pat.Tensor("scale"), &pat.Tensor("bias")},
+        {&pat.Tensor("group_out"),
+         &pat.Tensor("mean_out_0"),
+         &pat.Tensor("variance_out_0")});
+    // TODO(bukejiyu) :DRR support matching placeholder op,
+    // the following needs to be deleted
+    if (extra_add_) {
+      const auto &add1 = pat.Op(paddle::dialect::AddOp::name());
+      pat.Tensor("add_out1") =
+          trans_extra_add_
+              ? add1(pat.Tensor("any_tensor"), pat.Tensor("add_out"))
+              : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor"));
+    }
+    // The constraint reads only match_ctx, so the lambda captures nothing.
+    pat.AddConstraint([](const paddle::drr::MatchContext &match_ctx) {
+      auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x"));
+      return x_dtype.isa<pir::Float16Type>() ||
+             x_dtype.isa<pir::BFloat16Type>();
+    });
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+    const auto &add_group_norm_silu_op =
+        res.Op(paddle::dialect::AddGroupNormSiluOp::name(),
+               {{"epsilon", pat.Attr("epsilon")},
+                {"groups", pat.Attr("groups")},
+                {"data_format", pat.Attr("data_format")},
+                {"activation", res.StrAttr("")}});
+    // The fused op produces group_out and add_out in place of the two ops.
+    add_group_norm_silu_op({&res.Tensor("x"),
+                            &res.Tensor("residual"),
+                            &res.Tensor("scale"),
+                            &res.Tensor("bias")},
+                           {&res.Tensor("group_out"),
+                            &res.Tensor("add_out"),
+                            &res.Tensor("mean_out"),
+                            &res.Tensor("variance_out")});
+  }
+};
+
+// Folds a SiluOp that consumes group_out of an activation-free
+// AddGroupNormSiluOp into that op by setting activation to "silu".
+class AddGroupNormWithActPattern : public paddle::drr::DrrPatternBase {
+ public:
+  uint32_t benefit() const override { return 2; }
+  std::string name() const override { return "AddGroupNormWithActPattern"; }
+
+  void operator()(paddle::drr::DrrPatternContext *ctx) const override {
+    paddle::drr::SourcePattern pat = ctx->SourcePattern();
+    const auto &add_group_norm_silu_op =
+        pat.Op(paddle::dialect::AddGroupNormSiluOp::name(),
+               {{"epsilon", pat.Attr("epsilon")},
+                {"groups", pat.Attr("groups")},
+                {"data_format", pat.Attr("data_format")},
+                {"activation", pat.Attr("activation")}});
+    const auto &silu = pat.Op(paddle::dialect::SiluOp::name());
+    add_group_norm_silu_op({&pat.Tensor("x"),
+                            &pat.Tensor("residual"),
+                            &pat.Tensor("scale"),
+                            &pat.Tensor("bias")},
+                           {&pat.Tensor("group_out"),
+                            &pat.Tensor("add_out"),
+                            &pat.Tensor("mean_out_0"),
+                            &pat.Tensor("variance_out_0")});
+    pat.Tensor("silu_out") = silu(pat.Tensor("group_out"));
+    pat.AddConstraint([](const paddle::drr::MatchContext &match_ctx) {
+      auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x"));
+      if (!x_dtype.isa<pir::Float16Type>() &&
+          !x_dtype.isa<pir::BFloat16Type>()) {
+        return false;
+      }
+      // Only fold silu into an op that carries no activation yet.
+      return match_ctx.Attr<std::string>("activation").empty();
+    });
+    paddle::drr::ResultPattern res = pat.ResultPattern();
+    const auto &res_add_group_norm_silu_op =
+        res.Op(paddle::dialect::AddGroupNormSiluOp::name(),
+               {{"epsilon", pat.Attr("epsilon")},
+                {"groups", pat.Attr("groups")},
+                {"data_format", pat.Attr("data_format")},
+                {"activation", res.StrAttr("silu")}});
+    // silu_out takes the place of group_out as the fused op's first output.
+    res_add_group_norm_silu_op({&res.Tensor("x"),
+                                &res.Tensor("residual"),
+                                &res.Tensor("scale"),
+                                &res.Tensor("bias")},
+                               {&res.Tensor("silu_out"),
+                                &res.Tensor("add_out"),
+                                &res.Tensor("mean_out"),
+                                &res.Tensor("variance_out")});
+  }
+};
+
 class AddNormFusePass : public pir::PatternRewritePass {
  public:
   AddNormFusePass() : pir::PatternRewritePass("add_norm_fuse_pass", 2) {}
@@ -290,13 +409,36 @@ class AddNormFusePass : public pir::PatternRewritePass {
     // x--------
     //           add-rms_norm ---> rms_norm
     // residual-
-    ps.Add(paddle::drr::Create<AddRmsNormFusePattern>(context, !extra_add));
-    ps.Add(paddle::drr::Create<AddRmsNormFusePattern>(context, extra_add));
+    ps.Add(
+        paddle::drr::Create<AddRmsNormFusePattern>(context, !extra_add, false));
+    ps.Add(
+        paddle::drr::Create<AddRmsNormFusePattern>(context, extra_add, true));
+    ps.Add(
+        paddle::drr::Create<AddRmsNormFusePattern>(context, extra_add, false));
+
     // x--------
     //           add-layer_norm ----> fused_bias_residual_layernorm
     // residual-
-    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(context, !extra_add));
-    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(context, extra_add));
+    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(
+        context, !extra_add, false));
+    ps.Add(
+        paddle::drr::Create<AddLayerNormFusePattern>(context, extra_add, true));
+    ps.Add(paddle::drr::Create<AddLayerNormFusePattern>(
+        context, extra_add, false));
+
+    // x--------
+    //           add-group_norm ----> add_group_norm_silu
+    // residual-
+    ps.Add(paddle::drr::Create<AddGroupNormFusePattern>(
+        context, !extra_add, false));
+    ps.Add(
+        paddle::drr::Create<AddGroupNormFusePattern>(context, extra_add, true));
+    ps.Add(paddle::drr::Create<AddGroupNormFusePattern>(
+        context, extra_add, false));
+
+    // add_group_norm_silu-silu --->add_group_norm_silu
+    ps.Add(paddle::drr::Create<AddGroupNormWithActPattern>(context));
+    // group-silu->add_group_norm_silu moved to group_norm_silu_fuse_pass
     return ps;
   }
 };
diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
index 96851cfeac559..754422312e47a 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc
@@ -35,8 +35,8 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase {
  private:
   std::string act_name_;
   bool cutlass_pattern_;
-  const std::unordered_set<std::string> conv2d_depthwise_act_set_ = {
-      "relu", "swish", "sigmoid"};
+  const std::unordered_set<std::string> conv2d_depthwise_act_set_ = {"relu",
+                                                                     "swish"};
 
  public:
   static const int CUTLASS_NHWC_ALIGNMENT = 8;
@@ -152,62 +152,6 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase {
         [this](const paddle::drr::MatchContext &match_ctx) -> std::string {
           return cutlass_pattern_ ? "gpu" : "gpudnn";
         });
-    const auto &perm_weight_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ || data_format == "NHWC") {
-            return {0, 2, 3, 1};
-          } else {
-            return {0, 1, 2, 3};
-          }
-        });
-    const auto &perm_input_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            return {0, 2, 3, 1};
-          } else {
-            return {0, 1, 2, 3};
-          }
-        });
-    const auto &perm_bias_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          auto bias_shape = pir::GetShapeFromValue(match_ctx.Tensor("bias"));
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            if (bias_shape.size() == 4) {
-              return {0, 2, 3, 1};
-            } else if (bias_shape.size() == 3) {
-              return {0, 2, 1};
-            } else {
-              return {0};
-            }
-          } else {
-            std::vector<int> dst_vector(bias_shape.size());
-            std::iota(dst_vector.begin(), dst_vector.end(), 0);
-            return dst_vector;
-          }
-        });
-    const auto &data_format_conv = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::string {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            return "NHWC";
-          } else {
-            return data_format;
-          }
-        });
-    // TODO(bukejiyu) When the transfer_layout_pass is supported,
-    // transpose_op will be deleted.
-    const auto &transpose_op_w = res.Op(paddle::dialect::TransposeOp::name(),
-                                        {{"perm", perm_weight_shape}});
-    const auto &transpose_op_input = res.Op(
-        paddle::dialect::TransposeOp::name(), {{"perm", perm_input_shape}});
-    const auto &transpose_op_bias = res.Op(paddle::dialect::TransposeOp::name(),
-                                           {{"perm", perm_bias_shape}});
-    res.Tensor("filter_transpose") = transpose_op_w(res.Tensor("filter"));
-    res.Tensor("input_transpose") = transpose_op_input(res.Tensor("input"));
-    res.Tensor("bias_transpose") = transpose_op_bias(res.Tensor("bias"));
     const auto &fused_conv2d_add_act = res.Op(
         paddle::dialect::FusedConv2dAddActOp::name(),
         {{
@@ -216,7 +160,7 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase {
             {"padding_algorithm", pat.Attr("padding_algorithm")},
             {"dilations", pat.Attr("dilations")},
             {"groups", pat.Attr("groups")},
-            {"data_format", data_format_conv},
+            {"data_format", pat.Attr("data_format")},
             {"activation", res.StrAttr(act_name_)},
             {"split_channels", res.VectorInt32Attr({})},
             {"exhaustive_search", res.BoolAttr(false)},
@@ -224,24 +168,11 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase {
             {"fuse_alpha", res.Float32Attr(0.0f)},
         }},
         {{{paddle::dialect::kForceBackendAttr, force_backend_runtime_attr}}});
-    fused_conv2d_add_act({&res.Tensor("input_transpose"),
-                          &res.Tensor("filter_transpose"),
-                          &res.Tensor("bias_transpose"),
+    fused_conv2d_add_act({&res.Tensor("input"),
+                          &res.Tensor("filter"),
+                          &res.Tensor("bias"),
                           &res.InputNoneTensor()},
-                         {&res.Tensor("fuesd_conv2d_add_act_out")});
-    const auto &perm_out_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            return {0, 3, 1, 2};
-          } else {
-            return {0, 1, 2, 3};
-          }
-        });
-    const auto &transpose_op_out = res.Op(paddle::dialect::TransposeOp::name(),
-                                          {{"perm", perm_out_shape}});
-    res.Tensor("act_out") =
-        transpose_op_out(res.Tensor("fuesd_conv2d_add_act_out"));
+                         {&res.Tensor("act_out")});
   }
 };
 
@@ -278,11 +209,9 @@ class Conv2dAdd2ActFusePattern
     if (next_op->isa<paddle::dialect::ReluOp>()) {
       act_name = "relu";
     }
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 8000 && CUDNN_VERSION < 8700
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8700
     if (next_op->isa<paddle::dialect::TanhOp>()) {
       act_name = "tanh";
-    } else if (next_op->isa<paddle::dialect::SigmoidOp>()) {
-      act_name = "sigmoid";
     }
 #endif
     if (act_name == "") {
@@ -346,11 +275,10 @@ class Conv2dAddActFusePass : public pir::PatternRewritePass {
                 paddle::dialect::FusedConv2dAddActOp::name()});
 
 // NOTE(liuyuanle): cudnn [8.7, 8.9 now) version has bug when act is
-// sigmoid/tanh. Ref to issue
+// tanh. Ref to issue
 // https://github.com/PaddlePaddle/Paddle/issues/50853
 #if CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8700
-    const std::unordered_set<std::string> cudnn_act_set(
-        {"relu", "sigmoid", "tanh"});
+    const std::unordered_set<std::string> cudnn_act_set({"relu", "tanh"});
 #else
     const std::unordered_set<std::string> cudnn_act_set({"relu"});
 #endif
diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
index 994fbdf2ce69f..89a023197a27e 100644
--- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc
@@ -138,62 +138,6 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase {
         [this](const paddle::drr::MatchContext &match_ctx) -> std::string {
           return cutlass_pattern_ ? "gpu" : "gpudnn";
         });
-    const auto &perm_weight_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ || data_format == "NHWC") {
-            return {0, 2, 3, 1};
-          } else {
-            return {0, 1, 2, 3};
-          }
-        });
-    const auto &perm_input_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            return {0, 2, 3, 1};
-          } else {
-            return {0, 1, 2, 3};
-          }
-        });
-    const auto &perm_bias_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          auto bias_shape = pir::GetShapeFromValue(match_ctx.Tensor("bias"));
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            if (bias_shape.size() == 4) {
-              return {0, 2, 3, 1};
-            } else if (bias_shape.size() == 3) {
-              return {0, 2, 1};
-            } else {
-              return {0};
-            }
-          } else {
-            std::vector<int> dst_vector(bias_shape.size());
-            std::iota(dst_vector.begin(), dst_vector.end(), 0);
-            return dst_vector;
-          }
-        });
-    const auto &data_format_conv = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::string {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            return "NHWC";
-          } else {
-            return data_format;
-          }
-        });
-    // TODO(bukejiyu) When the transfer_layout_pass is supported,
-    // transpose_op will be deleted.
-    const auto &transpose_op_w = res.Op(paddle::dialect::TransposeOp::name(),
-                                        {{"perm", perm_weight_shape}});
-    const auto &transpose_op_input = res.Op(
-        paddle::dialect::TransposeOp::name(), {{"perm", perm_input_shape}});
-    const auto &transpose_op_bias = res.Op(paddle::dialect::TransposeOp::name(),
-                                           {{"perm", perm_bias_shape}});
-    res.Tensor("filter_transpose") = transpose_op_w(res.Tensor("filter"));
-    res.Tensor("input_transpose") = transpose_op_input(res.Tensor("input"));
-    res.Tensor("bias_transpose") = transpose_op_bias(res.Tensor("bias"));
     const auto &fused_conv2d_add_act = res.Op(
         paddle::dialect::FusedConv2dAddActOp::name(),
         {{
@@ -202,7 +146,7 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase {
             {"padding_algorithm", pat.Attr("padding_algorithm")},
             {"dilations", pat.Attr("dilations")},
             {"groups", pat.Attr("groups")},
-            {"data_format", data_format_conv},
+            {"data_format", pat.Attr("data_format")},
             {"activation", res.StrAttr("identity")},
             {"split_channels", res.VectorInt32Attr({})},
             {"exhaustive_search", res.BoolAttr(false)},
@@ -211,25 +155,11 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase {
         }},
         {{{paddle::dialect::kForceBackendAttr, force_backend_runtime_attr}}});
 
-    fused_conv2d_add_act(
-        {&res.Tensor("input_transpose"),
-         &res.Tensor("filter_transpose"),
-         &res.Tensor("bias_transpose"),
-         &res.InputNoneTensor()},
-        {&res.Tensor("fuesd_conv2d_add_act_out"), &res.OutputNoneTensor()});
-    const auto &perm_out_shape = res.ComputeAttr(
-        [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> {
-          auto data_format = match_ctx.Attr<std::string>("data_format");
-          if (cutlass_pattern_ && data_format == "NCHW") {
-            return {0, 3, 1, 2};
-          } else {
-            return {0, 1, 2, 3};
-          }
-        });
-    const auto &transpose_op_out = res.Op(paddle::dialect::TransposeOp::name(),
-                                          {{"perm", perm_out_shape}});
-    res.Tensor("add_out") =
-        transpose_op_out(res.Tensor("fuesd_conv2d_add_act_out"));
+    fused_conv2d_add_act({&res.Tensor("input"),
+                          &res.Tensor("filter"),
+                          &res.Tensor("bias"),
+                          &res.InputNoneTensor()},
+                         {&res.Tensor("add_out"), &res.OutputNoneTensor()});
   }
 };
 
diff --git a/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc
index 0da1499a730c5..ecb450201a787 100644
--- a/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/matmul_add_act_fuse_pass.cc
@@ -60,11 +60,17 @@ class MatmulAddPattern : public paddle::drr::DrrPatternBase {
 
     pat.AddConstraint([&](const paddle::drr::MatchContext &match_ctx) {
       auto w_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("w"));
-      if (!w_dtype.isa<pir::Float16Type>() &&
-          !w_dtype.isa<pir::BFloat16Type>() &&
-          !w_dtype.isa<pir::Float32Type>() &&
-          !w_dtype.isa<pir::Float64Type>()) {
-        return false;
+      if (fused_op_name_ == paddle::dialect::GemmEpilogueOp::name()) {
+        if (!w_dtype.isa<pir::Float16Type>() &&
+            !w_dtype.isa<pir::BFloat16Type>()) {
+          return false;
+        }
+      } else {
+        if (!w_dtype.isa<pir::Float16Type>() &&
+            !w_dtype.isa<pir::Float32Type>() &&
+            !w_dtype.isa<pir::Float64Type>()) {
+          return false;
+        }
       }
       auto w_dims = pir::GetShapeFromValue(match_ctx.Tensor("w"));
       auto x_dims = pir::GetShapeFromValue(match_ctx.Tensor("x"));
diff --git a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc
index 38ee474c6352a..cfae35b765941 100644
--- a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc
+++ b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc
@@ -53,6 +53,9 @@ class OneDNNPlacementPattern : public pir::OpRewritePattern<OpType> {
       for (auto &attr : runtime_info.extra_args_default_value) {
         attributes[attr.first] = attr.second;
       }
+      if (attributes.find("is_test") != attributes.end()) {
+        attributes["is_test"] = rewriter.bool_attr(true);
+      }
 
       pir::Operation *op_item_inner = rewriter.Build(op->operands_source(),
                                                      attributes,
diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h
index db6a50a8ec3ad..01a122f2de6cc 100644
--- a/paddle/fluid/pir/transforms/passes.h
+++ b/paddle/fluid/pir/transforms/passes.h
@@ -38,6 +38,7 @@ USE_PIR_PASS(conv2d_add_fuse_pass);
 USE_PIR_PASS(conv2d_add_act_fuse_pass);
 USE_PIR_PASS(embedding_eltwise_layernorm_fuse_pass);
 USE_PIR_PASS(add_norm_fuse_pass);
+USE_PIR_PASS(group_norm_silu_fuse_pass);
 USE_PIR_PASS(fused_dot_product_attention_pass);
 USE_PIR_PASS(fused_flash_attn_pass);
 USE_PIR_PASS(remove_redundant_transpose_pass);
@@ -75,7 +76,6 @@ USE_PIR_PASS(onednn_placement_pass);
 
 #ifdef PADDLE_WITH_XPU
 USE_PIR_PASS(add_layernorm_xpu_fuse_pass);
-USE_PIR_PASS(group_norm_silu_xpu_fuse_pass);
 USE_PIR_PASS(conv2d_bn_xpu_fuse_pass);
 #endif
 
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
index feac98cc91f05..bf3df2a9623c5 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -62,8 +62,7 @@ COMMON_DECLARE_bool(use_mkldnn);
 COMMON_DECLARE_bool(print_ir);
 // COMMON_DECLARE_string(pir_onednn_kernel_blacklist);
 
-namespace paddle {
-namespace dialect {
+namespace paddle::dialect {
 
 pir::Type ConvertOpTypeToKernelType(pir::IrContext* ctx,
                                     pir::Type op_type,
@@ -3189,5 +3188,4 @@ std::unique_ptr<pir::Program> PdOpLowerToKernelPass(pir::Program* prog,
 
   return program;
 }
-}  // namespace dialect
-}  // namespace paddle
+}  // namespace paddle::dialect
diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc
index 9d522d8b2f0fe..0fe9e7efbfec2 100644
--- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc
+++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc
@@ -22,8 +22,7 @@
 PD_DECLARE_bool(use_stream_safe_cuda_allocator);
 COMMON_DECLARE_bool(new_executor_use_cuda_graph);
 
-namespace paddle {
-namespace platform {
+namespace paddle::platform {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void InitCUDNNRelatedHandle(phi::GPUContext* dev_ctx) {
@@ -186,5 +185,4 @@ std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
 }
 #endif
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform
diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc
index d6a0e749f93c8..3aa52da56f05b 100644
--- a/paddle/fluid/platform/denormal.cc
+++ b/paddle/fluid/platform/denormal.cc
@@ -38,8 +38,7 @@
 #include <pmmintrin.h>
 #endif
 
-namespace paddle {
-namespace platform {
+namespace paddle::platform {
 
 static void SetDenormalState(bool flush_zero_mode, bool denormals_zero_mode) {
 #ifdef DENORM_USE_INTRINSICS
@@ -80,5 +79,4 @@ ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
 }
 
 ScopedFlushDenormal::ScopedFlushDenormal() { SetDenormalState(true, true); }
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 358d52d03d31b..d7b164862cd7e 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -68,8 +68,7 @@ PADDLE_DEFINE_EXPORTED_uint64(cuda_memory_async_pool_realease_threshold,
                               "Amount of reserved memory in bytes to hold onto "
                               "before trying to release memory back to the OS");
 
-namespace paddle {
-namespace platform {
+namespace paddle::platform {
 
 void GpuMemoryUsage(size_t *available, size_t *total) {
   size_t actual_available, actual_total;
@@ -719,5 +718,4 @@ void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
   phi::backends::gpu::GpuMemsetAsync(dst, value, count, stream);
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 496b253dff5b3..980b7cb35410b 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -80,8 +80,14 @@ namespace dynload {
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
   __macro(cublasDgetriBatched);           \
+  __macro(cublasCgetrfBatched);           \
+  __macro(cublasCgetriBatched);           \
+  __macro(cublasZgetrfBatched);           \
+  __macro(cublasZgetriBatched);           \
   __macro(cublasSmatinvBatched);          \
   __macro(cublasDmatinvBatched);          \
+  __macro(cublasCmatinvBatched);          \
+  __macro(cublasZmatinvBatched);          \
   __macro(cublasSgetrsBatched);           \
   __macro(cublasDgetrsBatched);
 
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index aa8fd62aa85cc..21a45648fba63 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cudnn.h"
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace paddle::platform::dynload {
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
@@ -58,6 +56,4 @@ CUDNN_DNN_ROUTINE_EACH_R9(DEFINE_WRAP);
 
 bool HasCUDNN() { return phi::dynload::HasCUDNN(); }
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform::dynload
diff --git a/paddle/fluid/platform/dynload/curand.cc b/paddle/fluid/platform/dynload/curand.cc
index 9a6686515ea2b..b2aaff1d15427 100644
--- a/paddle/fluid/platform/dynload/curand.cc
+++ b/paddle/fluid/platform/dynload/curand.cc
@@ -14,14 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/curand.h"
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace paddle::platform::dynload {
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform::dynload
diff --git a/paddle/fluid/platform/dynload/cusolver.cc b/paddle/fluid/platform/dynload/cusolver.cc
index bf8394f3f02ca..1085aaebe052c 100644
--- a/paddle/fluid/platform/dynload/cusolver.cc
+++ b/paddle/fluid/platform/dynload/cusolver.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/cusolver.h"
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace paddle::platform::dynload {
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
@@ -30,6 +28,4 @@ CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
 CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
 #endif
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform::dynload
diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc
index 7b0ea3bb7f3c1..ee270918b59c7 100644
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/nccl.h"
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace paddle::platform::dynload {
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
@@ -38,6 +36,4 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
 NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP)
 #endif
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform::dynload
diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc
index 242aa912ad838..b157c8c239ca5 100644
--- a/paddle/fluid/platform/dynload/nvrtc.cc
+++ b/paddle/fluid/platform/dynload/nvrtc.cc
@@ -16,9 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/nvrtc.h"
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace paddle::platform::dynload {
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
@@ -26,6 +24,4 @@ NVRTC_ROUTINE_EACH(DEFINE_WRAP);
 
 bool HasNVRTC() { return phi::dynload::HasNVRTC(); }
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform::dynload
diff --git a/paddle/fluid/platform/dynload/warpctc.cc b/paddle/fluid/platform/dynload/warpctc.cc
index 48c78a130732e..0861ffc7a0c33 100644
--- a/paddle/fluid/platform/dynload/warpctc.cc
+++ b/paddle/fluid/platform/dynload/warpctc.cc
@@ -14,14 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/dynload/warpctc.h"
 
-namespace paddle {
-namespace platform {
-namespace dynload {
+namespace paddle::platform::dynload {
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
 
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform::dynload
diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc
index 40e0e226779b8..596667e2fa782 100644
--- a/paddle/fluid/platform/monitor.cc
+++ b/paddle/fluid/platform/monitor.cc
@@ -14,10 +14,6 @@
 
 #include "paddle/fluid/platform/monitor.h"
 
-namespace paddle {
-namespace platform {}  // namespace platform
-}  // namespace paddle
-
 DEFINE_INT_STATUS(STAT_total_feasign_num_in_mem)
 DEFINE_INT_STATUS(STAT_epoch_finish)
 DEFINE_INT_STATUS(STAT_gpu0_mem_size)
diff --git a/paddle/fluid/platform/profiler/cuda_tracer.cc b/paddle/fluid/platform/profiler/cuda_tracer.cc
index a462521db5144..ba559f24abfc8 100644
--- a/paddle/fluid/platform/profiler/cuda_tracer.cc
+++ b/paddle/fluid/platform/profiler/cuda_tracer.cc
@@ -33,10 +33,7 @@
     }                                                                        \
   } while (0)
 
-namespace paddle {
-namespace platform {
-
-namespace details {
+namespace paddle::platform::details {
 std::unordered_map<uint32_t, uint64_t> CreateThreadIdMapping() {
   std::unordered_map<uint32_t, uint64_t> mapping;
   std::unordered_map<uint64_t, ThreadId> ids = GetAllThreadIds();
@@ -45,7 +42,8 @@ std::unordered_map<uint32_t, uint64_t> CreateThreadIdMapping() {
   }
   return mapping;
 }
-}  // namespace details
+}  // namespace paddle::platform::details
+namespace paddle::platform {
 
 CudaTracer::CudaTracer() = default;
 
@@ -194,5 +192,4 @@ void CudaTracer::ReleaseBuffer(uint8_t* buffer) {
   paddle::framework::AlignedFree(buffer);
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform
diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc
index 8dd865e14e1c9..91ec92fe80b9b 100644
--- a/paddle/fluid/platform/profiler/event_python.cc
+++ b/paddle/fluid/platform/profiler/event_python.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
 #include "paddle/fluid/platform/profiler/extra_info.h"
 
-namespace paddle {
-namespace platform {
+namespace paddle::platform {
 
 HostPythonNode::~HostPythonNode() {
   // delete all runtime or device nodes and recursive delete children
@@ -192,5 +191,4 @@ std::unique_ptr<ProfilerResult> LoadProfilerResult(std::string filename) {
   return result;
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform
diff --git a/paddle/fluid/platform/timer.cc b/paddle/fluid/platform/timer.cc
index 855a3d47e38bb..7dc854e71bd02 100644
--- a/paddle/fluid/platform/timer.cc
+++ b/paddle/fluid/platform/timer.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/timer.h"
 
-namespace paddle {
-namespace platform {
+namespace paddle::platform {
 
 void Timer::Reset() {
   _start.tv_sec = 0;
@@ -59,5 +58,4 @@ int64_t Timer::Tickus() {
          (_now.tv_usec - _start.tv_usec);
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace paddle::platform
diff --git a/paddle/fluid/prim/api/all.cc b/paddle/fluid/prim/api/all.cc
index 85e1718ec982a..8ecba4f542415 100644
--- a/paddle/fluid/prim/api/all.cc
+++ b/paddle/fluid/prim/api/all.cc
@@ -13,7 +13,3 @@
 // limitations under the License.
 
 #include "paddle/fluid/prim/api/all.h"
-
-namespace paddle {
-namespace prim {}  // namespace prim
-}  // namespace paddle
diff --git a/paddle/fluid/prim/api/api.yaml b/paddle/fluid/prim/api/api.yaml
index a951ed4431a57..61e056678d19f 100644
--- a/paddle/fluid/prim/api/api.yaml
+++ b/paddle/fluid/prim/api/api.yaml
@@ -38,6 +38,7 @@
 - pad
 - sqrt
 - cumsum
+- cumprod
 - put_along_axis
 - sin
 - cos
diff --git a/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 b/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2
index 55b65bf05163f..b1b675a78589a 100644
--- a/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2
+++ b/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2
@@ -1,5 +1,5 @@
 {% from "utils.cc.j2" import static_prim_api %}
-// Generated by /paddle/fluid/prim/api/auto_code_generated/static_gen.py.  
+// Generated by /paddle/fluid/prim/api/auto_code_generated/static_gen.py.
 // DO NOT EDIT!
 
 #include <string.h>
diff --git a/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 b/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2
index 78a270ef37d5b..5e34af02f2857 100644
--- a/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2
+++ b/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2
@@ -25,7 +25,7 @@
   {% endfilter %}
   op->CheckAttrs();
   op->InferVarType(block);
-  op->InferShape(*block); 
+  op->InferShape(*block);
   {% if outputs|length > 1 %}
   return std::make_tuple{{sequence('(', ')', ', ', output_names)}};
   {% elif outputs|length == 1 %}
@@ -56,7 +56,7 @@ template <>
 {%- macro static_prim_api_sig_ret(outputs) -%}
   {%- set names = [] -%}
   {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type) -%} {%- endfor -%}
-  {%- if names|length > 1 -%} 
+  {%- if names|length > 1 -%}
 std::tuple<{{sequence('', '', ', ', names)}}>
   {%- else -%}
 {{names[0]}}
@@ -80,7 +80,7 @@ if ({{input.name}}) {
   std::transform({{input.name}}.get().begin(), {{input.name}}.get().end(), {{input.name}}_names.begin(), [](const Tensor& t) {
     return std::static_pointer_cast<prim::DescTensor>(t.impl())->Name();
   });
-  op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names);  
+  op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names);
 }
   {%- else -%}
 if ({{input.name}}) {
@@ -96,7 +96,7 @@ std::vector<std::string> {{input.name}}_names({{input.name}}.size());;
 std::transform({{input.name}}.begin(), {{input.name}}.end(), {{input.name}}_names.begin(), [](const Tensor& t) {
   return std::static_pointer_cast<prim::DescTensor>(t.impl())->Name();
 });
-op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names);  
+op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names);
   {%- else -%}
 op->SetInput("{{input.fluid_name | to_pascal}}", {std::static_pointer_cast<prim::DescTensor>({{input.name}}.impl())->Name()});
   {%- endif -%}
@@ -180,7 +180,7 @@ paddle::framework::TransToProtoVarType({{src_name}})
   {%- set is_set = [] -%}  {#- why not use boolean, ref: https://stackoverflow.com/questions/17925674/jinja2-local-global-variable -#}
   {%- if not is_set -%} {#- use DataType attr as default output dtype -#}
     {%- for attr in attrs -%}
-      {%- if attr.typename is datatype -%} 
+      {%- if attr.typename is datatype -%}
 {{attr.name}}
         {%- do is_set.append(1) -%}
       {%- endif -%}
diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h
index 0465f73a44593..17bc345917064 100644
--- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h
+++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h
@@ -744,13 +744,20 @@ void slice_grad(const Tensor& input,
       paddings.push_back(offsets[i]);
       paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]);
     }
+    Tensor reshape_out_grad;
+    if (out_grad.shape().size() == 0) {
+      reshape_out_grad = full<T>({1}, 1, input.dtype());
+    } else {
+      reshape_out_grad = out_grad;
+    }
+
     if (decrease_size > 0 &&
         (decrease_size != static_cast<size_t>(in_dims.size()))) {
       auto out_tmp =
-          pad<T>(reshape<T>(out_grad, origin_out_shape), paddings, 0.0);
+          pad<T>(reshape<T>(reshape_out_grad, origin_out_shape), paddings, 0.0);
       set_output<T>(out_tmp, input_grad);
     } else {
-      auto out_tmp = pad<T>(out_grad, paddings, 0.0);
+      auto out_tmp = pad<T>(reshape_out_grad, paddings, 0.0);
       set_output<T>(out_tmp, input_grad);
     }
   }
@@ -1127,11 +1134,13 @@ void prod_grad(const Tensor& x,
     } else {
       reduce_all = false;
     }
-    auto x_grad_tmp = Tensor();
-    auto out_tmp = Tensor();
+    auto out_grad_tmp = Tensor();
+    auto x_reshape = Tensor();
+    std::vector<int64_t> unchange_axis, change_axis, transpose_shape,
+        cumprod_shape;
+    std::vector<int> transpose_dim, origin_position;
     if (x_dim_size == 1) {
-      x_grad_tmp = out_grad.expand(IntArray(x_dim));
-      out_tmp = out.expand(IntArray(x_dim));
+      out_grad_tmp = out_grad.expand(IntArray(x_dim));
     } else {
       if (!keep_dim) {
         auto axis_ = std::vector<int64_t>();
@@ -1149,16 +1158,69 @@ void prod_grad(const Tensor& x,
         }
         auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
         auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
-        x_grad_tmp = out_grad_.expand(IntArray(x_dim));
-        auto out_ = reshape<T>(out, out_grad_shape);
-        out_tmp = out_.expand(IntArray(x_dim));
+        out_grad_tmp = out_grad_.expand(IntArray(x_dim));
       } else {
-        x_grad_tmp = out_grad.expand(IntArray(x_dim));
-        out_tmp = out.expand(IntArray(x_dim));
+        out_grad_tmp = out_grad.expand(IntArray(x_dim));
       }
     }
-    auto x_grad_res = x_grad_tmp * out_tmp * (1 / x);
-    set_output<T>(x_grad_res, x_grad);
+    auto axis_ = std::vector<int64_t>();
+    if (reduce_all) {
+      int64_t numel = 1;
+      for (int64_t i = 0; i < x_dim_size; i++) {
+        axis_.push_back(i);
+        numel *= x_dim[i];
+      }
+      cumprod_shape.push_back(numel);
+      x_reshape = reshape<T>(x, cumprod_shape);
+      auto left_cumprod = cumprod<T>(x_reshape, -1, true, false);
+      auto right_cumprod = cumprod<T>(x_reshape, -1, true, true);
+      auto x_grad_tmp = left_cumprod * right_cumprod;
+      auto x_grad_tmp2 = reshape<T>(x_grad_tmp, x.shape());
+      auto x_grad_res = x_grad_tmp2 * out_grad_tmp;
+      set_output<T>(x_grad_res, x_grad);
+    } else {
+      int64_t unchange_size = x_dim_size - axis_size;
+      int64_t unchange_index = 0;
+      for (int64_t i = 0; i < axis_size; i++) {
+        if (axis[i] < 0) {
+          axis_.push_back(axis[i] + x_dim_size);
+        } else {
+          axis_.push_back(axis[i]);
+        }
+      }
+      for (int64_t i = 0; i < x_dim_size; i++) {
+        auto it = find(axis_.begin(), axis_.end(), i);
+        if (it != axis_.end()) {
+          int64_t index = it - axis_.begin();
+          origin_position.push_back(static_cast<int>(unchange_size + index));
+        } else {
+          unchange_axis.push_back(i);
+          origin_position.push_back(static_cast<int>(unchange_index));
+          unchange_index += 1;
+        }
+      }
+      int64_t numel = 1;
+      for (int64_t i = 0; i < unchange_size; i++) {
+        transpose_shape.push_back(x_dim[unchange_axis[i]]);
+        cumprod_shape.push_back(x_dim[unchange_axis[i]]);
+        transpose_dim.push_back(static_cast<int>(unchange_axis[i]));
+      }
+      for (int64_t i = 0; i < axis_size; i++) {
+        transpose_shape.push_back(x_dim[axis_[i]]);
+        transpose_dim.push_back(static_cast<int>(axis_[i]));
+        numel *= x_dim[axis_[i]];
+      }
+      cumprod_shape.push_back(numel);
+      auto x_transpose = transpose<T>(x, transpose_dim);
+      x_reshape = reshape<T>(x_transpose, cumprod_shape);
+      auto left_cumprod = cumprod<T>(x_reshape, -1, true, false);
+      auto right_cumprod = cumprod<T>(x_reshape, -1, true, true);
+      auto x_grad_tmp = left_cumprod * right_cumprod;
+      auto x_grad_reshape = reshape<T>(x_grad_tmp, transpose_shape);
+      auto x_grad_tmp2 = transpose<T>(x_grad_reshape, origin_position);
+      auto x_grad_res = x_grad_tmp2 * out_grad_tmp;
+      set_output<T>(x_grad_res, x_grad);
+    }
   }
 }
 
diff --git a/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc b/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc
index d667f0fabd71e..ab317a702a85d 100644
--- a/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc
+++ b/paddle/fluid/prim/api/manual_prim/eager_prim_api.cc
@@ -16,8 +16,7 @@
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/prim/api/manual_prim/prim_manual_api.h"
 
-namespace paddle {
-namespace prim {
+namespace paddle::prim {
 
 template <>
 Tensor full<Tensor>(const IntArray& shape,
@@ -44,5 +43,4 @@ Tensor slice<Tensor>(const Tensor& input,
   return ::slice_ad_func(input, axes, starts, ends, infer_flags, decrease_axis);
 }
 
-}  // namespace prim
-}  // namespace paddle
+}  // namespace paddle::prim
diff --git a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc
index c45a473b4a8d3..f362440623f5e 100644
--- a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc
+++ b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc
@@ -34,8 +34,7 @@
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/enforce.h"
-namespace paddle {
-namespace prim {
+namespace paddle::prim {
 
 template <>
 Tensor full<DescTensor>(const IntArray& shape,
@@ -152,5 +151,4 @@ Tensor slice<DescTensor>(const Tensor& input,
   return out;
 }
 
-}  // namespace prim
-}  // namespace paddle
+}  // namespace paddle::prim
diff --git a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc
index 2f76e8bbd966f..43ab21ccd3e06 100644
--- a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc
+++ b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc
@@ -23,8 +23,7 @@
 #include "paddle/fluid/prim/utils/static/static_global_utils.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/core/utils/data_type.h"
-namespace paddle {
-namespace prim {
+namespace paddle::prim {
 using Tensor = paddle::Tensor;
 template <>
 TEST_API Tensor empty<DescTensor>(const paddle::experimental::IntArray& shape,
@@ -69,5 +68,4 @@ void by_pass<DescTensor>(const paddle::Tensor& x, paddle::Tensor* real_out) {
   set_output<DescTensor>(out, real_out);
 }
 
-}  // namespace prim
-}  // namespace paddle
+}  // namespace paddle::prim
diff --git a/paddle/fluid/prim/utils/static/static_global_utils.cc b/paddle/fluid/prim/utils/static/static_global_utils.cc
index 3d1aa2158048d..71179429dc997 100644
--- a/paddle/fluid/prim/utils/static/static_global_utils.cc
+++ b/paddle/fluid/prim/utils/static/static_global_utils.cc
@@ -14,12 +14,10 @@
 
 #include "paddle/fluid/prim/utils/static/static_global_utils.h"
 
-namespace paddle {
-namespace prim {
+namespace paddle::prim {
 StaticCompositeContext* StaticCompositeContext::static_composite_context_ =
     new StaticCompositeContext();
 thread_local bool StaticCompositeContext::enable_bwd_prim_ = false;
 thread_local bool StaticCompositeContext::enable_fwd_prim_ = false;
 thread_local bool StaticCompositeContext::enable_eager_prim_ = false;
-}  // namespace prim
-}  // namespace paddle
+}  // namespace paddle::prim
diff --git a/paddle/fluid/primitive/base/decomp_trans.cc b/paddle/fluid/primitive/base/decomp_trans.cc
index 06df447a600b8..22971d21eec40 100644
--- a/paddle/fluid/primitive/base/decomp_trans.cc
+++ b/paddle/fluid/primitive/base/decomp_trans.cc
@@ -41,7 +41,8 @@ std::unordered_set<std::string> decomp_op_contain_none = {"pd_op.squeeze",
                                                           "pd_op.unsqueeze",
                                                           "pd_op.flatten",
                                                           "pd_op.batch_norm",
-                                                          "pd_op.batch_norm_"};
+                                                          "pd_op.batch_norm_",
+                                                          "pd_op.dropout"};
 //
 std::unordered_set<std::string> dynamic_shape_blacklist = {
     "pd_op.squeeze",
diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
index 7f9f4b5b8676f..b8910ff5b9d9a 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
@@ -12,7 +12,7 @@ namespace backend {
 
 {%- macro args(inputs, attrs) -%}  {#- Arguments are variable pass into method -#}
   {{common.sequence('', '', ', ', inputs)}}
-  {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between 
+  {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between
   nputs and attrs -#}
   {{common.sequence('', '', ', ', attrs)}}
 {%- endmacro -%}
@@ -37,7 +37,7 @@ return ::{{name}}_ad_func({{common.args(input_names, attr_names)}});
 {% for api in apis %}
   {%- if api.is_prim and api.name not in backend_black_list and api.name[-1] !=  '_' -%}
 {{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} {
-{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} 
+{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}}
 }
 
   {% endif %}
diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
index 26f81d756f0b5..8e4921acbb013 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
@@ -117,20 +117,20 @@ pir::Value {{attr.name}}_res = std::static_pointer_cast<LazyTensor>({{attr.name~
     {% endif %}
   {% endfor %}
   {%- set input_names = [] -%}
-  {%- for i in inputs -%} 
-    {%- do input_names.append(i.name~'_res') -%} 
+  {%- for i in inputs -%}
+    {%- do input_names.append(i.name~'_res') -%}
   {%- endfor -%}
   {%- if mutable_attribute_as_inputs -%}
-    {%- for i in attrs -%} 
+    {%- for i in attrs -%}
       {%- if i is mutable_attribute -%}
-        {%- do input_names.append(i.name~'_res') -%} 
+        {%- do input_names.append(i.name~'_res') -%}
       {%- endif -%}
     {%- endfor -%}
   {%- endif -%}
   {%- set attr_names = [] -%}
-  {%- for i in attrs -%} 
+  {%- for i in attrs -%}
     {%- if  not mutable_attribute_as_inputs or mutable_attribute_as_inputs and i is not mutable_attribute -%}{#- do nothing -#}
-      {%- do attr_names.append(common.phi2ir_attr(i)) -%} 
+      {%- do attr_names.append(common.phi2ir_attr(i)) -%}
     {%- endif -%}
   {% endfor %}
 auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}});
@@ -145,14 +145,14 @@ auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}
   {% set api_outputs = api.outputs | trip_intermediate %}
 {{sig(api.name, api.inputs, api_outputs, api.attrs)}} {
   {% filter indent(2, True) %}
-{{body(api.name, api.inputs, api_outputs, api.attrs)}} 
+{{body(api.name, api.inputs, api_outputs, api.attrs)}}
   {% endfilter %}
 }
 
     {% if api.attrs is exist_mutable_attribute %}
 {{sig(api.name, api.inputs, api_outputs, api.attrs, True)}} {
   {% filter indent(2, True) %}
-{{body(api.name, api.inputs, api_outputs, api.attrs, True)}} 
+{{body(api.name, api.inputs, api_outputs, api.attrs, True)}}
   {% endfilter %}
 }
 
diff --git a/paddle/fluid/primitive/codegen/templates/common.j2 b/paddle/fluid/primitive/codegen/templates/common.j2
index b29401133db03..ecf5e54cae33b 100644
--- a/paddle/fluid/primitive/codegen/templates/common.j2
+++ b/paddle/fluid/primitive/codegen/templates/common.j2
@@ -8,12 +8,12 @@ template <typename T>
   {%- set input_params = [] -%}
   {%- for i in inputs -%} {%- do input_params.append(i.typename|to_paddle_input_type(i.optional)~' '~i.name) -%} {%- endfor -%}
   {%- set attr_params = [] -%}
-  {%- for i in attrs -%} 
+  {%- for i in attrs -%}
     {%- if not mutable_attribute_as_inputs or i is not mutable_attribute -%}
       {%- if default -%}
-        {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name~default_value(i)) -%} 
+        {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name~default_value(i)) -%}
       {%- else -%}
-        {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name) -%} 
+        {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name) -%}
       {%- endif -%}
     {%- else -%}
       {%- do input_params.append('const Tensor&'~' '~i.name~'_') -%}
@@ -43,7 +43,7 @@ template <typename T>
 {%- macro ret(outputs) -%}
   {%- set names = [] -%}
   {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type(i.optional)) -%} {%- endfor -%}
-  {%- if names|length > 1 -%} 
+  {%- if names|length > 1 -%}
 std::tuple<{{sequence('', '', ', ', names)}}>
   {%- else -%}
 {{names[0]}}
diff --git a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2
index 460b8e3a2fcdc..592b45b84aa72 100644
--- a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2
+++ b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2
@@ -139,13 +139,13 @@ std::vector<std::vector<pir::Value>> {{class_name}}::DecompVjp(pir::Operation* o
     auto stop_gradients_attr = op->attribute(kAttrStopGradients)
                                    .dyn_cast<pir::ArrayAttribute>()
                                    .AsVector();
-    {% for k in range(outputs|length) %}         
+    {% for k in range(outputs|length) %}
     stop_gradients[{{k}}].push_back(
         stop_gradients_attr[{{k}}].dyn_cast<pir::BoolAttribute>().data());
-    {% endfor %} 
+    {% endfor %}
     VLOG(4) << " stop_gradients is set ";
   } else {
-    {% for k in range(outputs|length) %} 
+    {% for k in range(outputs|length) %}
     stop_gradients[{{k}}].push_back(false);
     {% endfor %}
     VLOG(4) << " stop_gradients is not set ";
@@ -160,7 +160,7 @@ std::vector<std::vector<pir::Value>> {{class_name}}::DecompVjp(pir::Operation* o
   VLOG(4) << "Call Pir Decomposed backward op {{fwd_name}}";
 
 
-  {% for k in range(outputs|length) %} 
+  {% for k in range(outputs|length) %}
   paddle::Tensor* {{outputs[k].name}} = !stop_gradients[{{k}}][0] ? &tensor_res[{{k}}][0] : nullptr;
   {% endfor %}
 
diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
index 105175758f22d..31ec42aacd7a9 100644
--- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
@@ -79,11 +79,11 @@ auto {{i.name}} = phi::IntArray(paddle::dialect::GetInt64Vector({{i.name}}_defin
   {%- for api in apis -%} {%- do api_map.update({api.name: api}) -%} {%- endfor -%}
   {%- for i in api.inputs -%} {%- do input_names.append(i.name) -%} {%- endfor -%}
   {%- set attr_names=[] -%}
-  {%- for i in api.attrs -%} 
+  {%- for i in api.attrs -%}
     {%- if i is mutable_attribute -%}
-      {%- do input_names.append(i.name~'_') -%} 
+      {%- do input_names.append(i.name~'_') -%}
     {%- else -%}
-      {%- do attr_names.append(i.name) -%} 
+      {%- do attr_names.append(i.name) -%}
     {%- endif -%}
   {%- endfor %}
   {% if 'invoke' in api and api.invoke.func in api_map %}
@@ -116,7 +116,7 @@ FLAGS_tensor_operants_mode = "static";
 VLOG(4) << "Call Pir Decomposed backward op {{api.name}}";
   {% for i in range(api.outputs|length) %}
     {% if api.outputs[i].typename=='Tensor' %}
-paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{{i}}][0] : nullptr; 
+paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{{i}}][0] : nullptr;
     {% else %}
 std::vector<paddle::Tensor*> {{api.outputs[i].name}}(stop_gradients[{{i}}].size(), nullptr);
 for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) {
diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h
index 091d540aa461a..eca7bfb3a616f 100644
--- a/paddle/fluid/primitive/composite/composite.h
+++ b/paddle/fluid/primitive/composite/composite.h
@@ -23,9 +23,6 @@ namespace paddle {
 namespace primitive {
 namespace details {
 
-// empty_shape means x.shape=[]
-static std::vector<int64_t> empty_shape;
-
 template <typename T>
 static Tensor get_slice(const Tensor& x, int64_t idx) {
   return slice<T>(x, {0}, {idx}, {idx + 1}, {1}, {});
@@ -98,7 +95,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) {
     for (size_t i = 0; i < axis_.size(); i++) {
       value_ *= x_dim[axis_[i]];
     }
-    value = full<T>(empty_shape, value_, sum_x.dtype());
+    value = full_scalar<T>(value_, sum_x.dtype());
   }
 
   Tensor res = sum_x / value;
@@ -148,7 +145,7 @@ Tensor p_norm_decomp(const Tensor& x,
   Tensor res;
   if (porder == 0.0) {
     // 0-norm
-    auto zero = full<T>(empty_shape, 0, x_tmp.dtype());
+    auto zero = full_scalar<T>(0, x_tmp.dtype());
     auto none_zero = not_equal<T>(x_tmp, zero);
     res = cast<T>(none_zero, x_tmp.dtype());
     res = sum<T>(res, {axis}, x_tmp.dtype(), keepdim);
@@ -169,8 +166,8 @@ Tensor p_norm_decomp(const Tensor& x,
     res = min<T>(x_tmp, {axis}, keepdim);
   } else {
     // vanilla p-norm
-    auto porder_tensor = full<T>(empty_shape, porder, x_tmp.dtype());
-    auto inv_porder_tensor = full<T>(empty_shape, 1 / porder, x_tmp.dtype());
+    auto porder_tensor = full_scalar<T>(porder, x_tmp.dtype());
+    auto inv_porder_tensor = full_scalar<T>(1 / porder, x_tmp.dtype());
     res = elementwise_pow<T>(x_tmp, porder_tensor);
     res = sum<T>(res, {axis}, x_tmp.dtype(), keepdim);
     res = elementwise_pow<T>(res, inv_porder_tensor);
@@ -194,8 +191,7 @@ Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) {
   }
 
   check_valid_type(y.dtype());
-  Tensor y_full = full<T>(empty_shape, y, x_cast.dtype());
-
+  Tensor y_full = full_scalar<T>(y, x_cast.dtype());
   auto ans = elementwise_pow<T>(x_cast, y_full);
   if (need_cast) {
     return cast<T>(ans, org_dtype);
@@ -229,7 +225,12 @@ Tensor one_hot_decomp(const Tensor& x, const Tensor& num_classes) {
       backend::full_with_tensor<T>(num_classes, 0, x.dtype());
 
   std::vector<int64_t> input_dim;
-  input_dim.push_back(x.shape()[0]);
+  // Number of elements in x, i.e. the product of all of its dimensions.
+  int64_t x_dims = 1;
+  for (int64_t dim : x.shape()) {
+    x_dims *= dim;
+  }
+  input_dim.push_back(x_dims);
   input_dim.push_back(num_classes_tensor.shape()[0]);
   auto input_tensor = full<T>(input_dim, 0, x.dtype());
 
@@ -239,13 +240,13 @@ Tensor one_hot_decomp(const Tensor& x, const Tensor& num_classes) {
   }
   output_dim.push_back(num_classes_tensor.shape()[0]);
 
-  auto end = full<T>({1}, x.shape()[0], x.dtype());
+  auto end = full<T>({1}, x_dims, x.dtype());
   auto start = full<T>({1}, 0, x.dtype());
   auto step = full<T>({1}, 1, x.dtype());
   auto arange_tensor =
       backend::arange_with_tensor<T>(start, end, step, x.dtype());
 
-  std::vector<int64_t> reshape_dim{x.shape()[0], 1};
+  std::vector<int64_t> reshape_dim{x_dims, 1};
   auto x_reshape = reshape<T>(x, reshape_dim);
   auto arange_tensor_reshape = reshape<T>(arange_tensor, reshape_dim);
 
@@ -254,7 +255,7 @@ Tensor one_hot_decomp(const Tensor& x, const Tensor& num_classes) {
   index_concat.push_back(x_reshape);
   auto index_tensor = concat<T>(index_concat, 1);
 
-  auto update_tensor = full<T>({x.shape()[0]}, 1, x.dtype());
+  auto update_tensor = full<T>({x_dims}, 1, x.dtype());
 
   auto ans = reshape<T>(
       cast<T>(scatter_nd_add<T>(input_tensor, index_tensor, update_tensor),
@@ -282,13 +283,13 @@ Tensor squared_l2_norm_decomp(const Tensor& x) {
 
 template <typename T>
 Tensor reciprocal_decomp(const Tensor& x) {
-  return full<T>(empty_shape, 1.0, x.dtype()) / x;
+  return full_scalar<T>(1.0, x.dtype()) / x;
 }
 
 template <typename T>
 Tensor bce_loss_decomp(const Tensor& x, const Tensor& label) {
-  auto one = full<T>(empty_shape, 1, x.dtype());
-  auto ans = full<T>(empty_shape, -1, x.dtype()) *
+  auto one = full_scalar<T>(1, x.dtype());
+  auto ans = full_scalar<T>(-1, x.dtype()) *
              (label * log<T>(x) + (one - label) * log<T>(one - x));
   return ans;
 }
@@ -382,7 +383,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_decomp(
     }
   }
 
-  Tensor half = full<T>(empty_shape, -0.5, x_cast.dtype());
+  Tensor half = full_scalar<T>(-0.5, x_cast.dtype());
 
   bool use_run_stat = (is_test && (!trainable_statistics)) || use_global_stats;
   Tensor x_hat;
@@ -421,9 +422,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_decomp(
     run_var_ = assign<T>(run_var);
   }
   Tensor y;
-  Tensor new_scale =
-      scale ? scale.get() : full<T>(empty_shape, 1, x_cast.dtype());
-  Tensor new_bias = bias ? bias.get() : full<T>(empty_shape, 0, x_cast.dtype());
+  Tensor new_scale = scale ? scale.get() : full_scalar<T>(1, x_cast.dtype());
+  Tensor new_bias = bias ? bias.get() : full_scalar<T>(0, x_cast.dtype());
   if (data_layout_ == DataLayout::kNHWC) {
     y = x_hat * new_scale + new_bias;
   } else {
@@ -441,8 +441,10 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_decomp(
     return std::make_tuple(
         y, run_mean_, run_var_, batch_mean_, inv_std_, reserve_space);
   } else {
+    Tensor batch_mean_none;
+    Tensor inv_std_none;
     return std::make_tuple(
-        y, run_mean_, run_var_, batch_mean_, inv_std_, reserve_space);
+        y, run_mean_, run_var_, batch_mean_none, inv_std_none, reserve_space);
   }
 }
 
@@ -539,13 +541,13 @@ Tensor swiglu_decomp(const Tensor& x, const paddle::optional<Tensor>& y) {
 
 template <typename T>
 Tensor relu_decomp(const Tensor& x) {
-  return maximum<T>(x, full<T>(empty_shape, 0.0, x.dtype()));
+  return maximum<T>(x, full_scalar<T>(0.0, x.dtype()));
 }
 
 template <typename T>
 Tensor relu6_decomp(const Tensor& x) {
-  auto tmp = maximum<T>(x, full<T>(empty_shape, 0.0, x.dtype()));
-  auto res = minimum<T>(tmp, full<T>(empty_shape, 6.0, x.dtype()));
+  auto tmp = maximum<T>(x, full_scalar<T>(0.0, x.dtype()));
+  auto res = minimum<T>(tmp, full_scalar<T>(6.0, x.dtype()));
   return res;
 }
 
@@ -653,7 +655,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_decomp(
     auto difference = x_cast - mean_;
     auto var_tmp1 = difference * difference;
     auto variance = mean_decomp<T>(var_tmp1, axis, true);
-    auto var_tmp3 = variance + full<T>(empty_shape, epsilon, variance.dtype());
+    auto var_tmp3 = variance + full_scalar<T>(epsilon, variance.dtype());
     auto rsqrt_var = rsqrt<T>(var_tmp3);
     auto out = difference * rsqrt_var;
 
@@ -798,18 +800,18 @@ std::tuple<Tensor, Tensor> dropout_decomp(
   Tensor uniform_tensor;
   if (has_dynamic_shape(x.shape())) {
     auto shape_tensor = shape<T>(x);
-    auto zero = full<T>(empty_shape, 0.0, dtype_tmp);
-    auto one = full<T>(empty_shape, 1.0, dtype_tmp);
+    auto zero = full_scalar<T>(0.0, dtype_tmp);
+    auto one = full_scalar<T>(1.0, dtype_tmp);
     uniform_tensor =
         backend::uniform<T>(shape_tensor, zero, one, dtype_tmp, seed_tmp);
   } else {
     uniform_tensor =
         uniform<T>(phi::vectorize(x.dims()), dtype_tmp, 0.0, 1.0, seed_tmp);
   }
-  auto mask = cast<T>(
-      greater_equal<T>(uniform_tensor, full<T>(empty_shape, p, dtype_tmp)),
-      org_dtype);
-  auto ones_p = full<T>(empty_shape, 1.0 - p.to<float>(), org_dtype);
+  auto mask =
+      cast<T>(greater_equal<T>(uniform_tensor, full_scalar<T>(p, dtype_tmp)),
+              org_dtype);
+  auto ones_p = full_scalar<T>(1.0 - p.to<float>(), org_dtype);
   if (upscale_in_train) {
     if (is_test) {
       // inference: out = input
@@ -818,7 +820,7 @@ std::tuple<Tensor, Tensor> dropout_decomp(
       // train: out = input * mask / ( 1.0 - p )
       if (p.to<float>() == 1.0) {
         // Process p=1. for avoid divide zero error (x*mask/(1.0-p))
-        auto zero = full<T>(empty_shape, 0.0, org_dtype);
+        auto zero = full_scalar<T>(0.0, org_dtype);
         return std::make_tuple(x * zero, cast<T>(zero, DataType::UINT8));
       } else {
         auto ans = (x * mask) / ones_p;
@@ -842,20 +844,20 @@ Tensor gelu_decomp(const Tensor& x, bool approximate) {
   const double PM_SQRT1_2 = 0.70710678118654752440;  /* 1/sqrt(2) */
 
   auto org_dtype = x.dtype();
-  auto half = full<T>(empty_shape, 0.5, org_dtype);
-  auto one = full<T>(empty_shape, 1.0, org_dtype);
+  auto half = full_scalar<T>(0.5, org_dtype);
+  auto one = full_scalar<T>(1.0, org_dtype);
   if (approximate) {
     // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
-    auto kAlpha = full<T>(empty_shape, PM_2_SQRTPI * PM_SQRT1_2, org_dtype);
-    auto GELU_CONSTANT = full<T>(empty_shape, 0.044715, org_dtype);
-    auto x_pow3 = elementwise_pow<T>(x, full<T>(empty_shape, 3, org_dtype));
+    auto kAlpha = full_scalar<T>(PM_2_SQRTPI * PM_SQRT1_2, org_dtype);
+    auto GELU_CONSTANT = full_scalar<T>(0.044715, org_dtype);
+    auto x_pow3 = elementwise_pow<T>(x, full_scalar<T>(3, org_dtype));
     auto tanh_out = tanh<T>(kAlpha * (x + x_pow3 * GELU_CONSTANT));
 
     auto res = x * half * (one + tanh_out);
     return res;
   } else {
     // gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
-    auto M_SQRT1_2T = full<T>(empty_shape, PM_SQRT1_2, org_dtype);
+    auto M_SQRT1_2T = full_scalar<T>(PM_SQRT1_2, org_dtype);
     auto erf_out = one + erf<T>(x * M_SQRT1_2T);
 
     auto res = x * half * erf_out;
@@ -867,10 +869,10 @@ template <typename T>
 Tensor hardsigmoid_decomp(const Tensor& x, float slope, float offset) {
   const double MAX_VALUE = 1.0;
   const double MIN_VALUE = 0.0;
-  return maximum<T>(minimum<T>(x * full<T>(empty_shape, slope, x.dtype()) +
-                                   full<T>(empty_shape, offset, x.dtype()),
-                               full<T>(empty_shape, MAX_VALUE, x.dtype())),
-                    full<T>(empty_shape, MIN_VALUE, x.dtype()));
+  return maximum<T>(minimum<T>(x * full_scalar<T>(slope, x.dtype()) +
+                                   full_scalar<T>(offset, x.dtype()),
+                               full_scalar<T>(MAX_VALUE, x.dtype())),
+                    full_scalar<T>(MIN_VALUE, x.dtype()));
 }
 
 template <typename T>
@@ -881,15 +883,15 @@ Tensor hardswish_decomp(const Tensor& x) {
 
   // out = minimum(maximum(x + offset, 0), threshold) * x / scale
   auto minimum_out =
-      minimum<T>(maximum<T>(x + full<T>(empty_shape, OFFSET, x.dtype()),
-                            full<T>(empty_shape, 0.0, x.dtype())),
-                 full<T>(empty_shape, THRESHOLD, x.dtype()));
-  return (minimum_out * x) / full<T>(empty_shape, SCALE, x.dtype());
+      minimum<T>(maximum<T>(x + full_scalar<T>(OFFSET, x.dtype()),
+                            full_scalar<T>(0.0, x.dtype())),
+                 full_scalar<T>(THRESHOLD, x.dtype()));
+  return (minimum_out * x) / full_scalar<T>(SCALE, x.dtype());
 }
 
 template <typename T>
 Tensor leaky_relu_decomp(const Tensor& x, float negative_slope) {
-  auto multiply_tmp = full<T>(empty_shape, negative_slope, x.dtype()) * x;
+  auto multiply_tmp = full_scalar<T>(negative_slope, x.dtype()) * x;
   if (negative_slope < 1.0) {
     return maximum<T>(x, multiply_tmp);
   } else {
@@ -1127,8 +1129,7 @@ std::tuple<Tensor, Tensor, Tensor> group_norm_decomp(
     var_ = maximum<T>(
         var_tmp_,
         backend::full_with_tensor<T>(shape<T>(var_tmp_), 0, var_tmp_.dtype()));
-    Tensor var_inv =
-        rsqrt<T>(var_ + full<T>(empty_shape, epsilon, var_.dtype()));
+    Tensor var_inv = rsqrt<T>(var_ + full_scalar<T>(epsilon, var_.dtype()));
     Tensor res = (x_cast - mean_) * var_inv;
     out = backend::reshape<T>(res, x_dim_t);
   } else {
@@ -1143,7 +1144,7 @@ std::tuple<Tensor, Tensor, Tensor> group_norm_decomp(
     auto var_tmp_ =
         mean_decomp<T>(x_cast * x_cast, c_axis, true) - mean_ * mean_;
     var_ = maximum<T>(var_tmp_, full<T>(var_tmp_.shape(), 0, var_tmp_.dtype()));
-    auto var_inv = rsqrt<T>(var_ + full<T>(empty_shape, epsilon, var_.dtype()));
+    auto var_inv = rsqrt<T>(var_ + full_scalar<T>(epsilon, var_.dtype()));
     auto res = (x_cast - mean_) * var_inv;
     out = reshape<T>(res, x_dim);
   }
@@ -1207,7 +1208,7 @@ Tensor square_decomp(const Tensor& x) {
   }
 
   Tensor two;
-  two = full<T>(empty_shape, 2, x_cast.dtype());
+  two = full_scalar<T>(2, x_cast.dtype());
 
   auto ans = elementwise_pow<T>(x_cast, two);
   if (need_cast) {
@@ -1224,9 +1225,8 @@ Tensor sigmoid_cross_entropy_with_logits_decomp(
     const paddle::optional<Tensor>& pos_weight,
     bool normalize,
     int ignore_index) {
-  auto dims = x.shape();
-  const Tensor zero = full<T>(dims, 0, x.type());
-  const Tensor one = full<T>(dims, 1, x.type());
+  const Tensor zero = full_like_decomp<T>(x, 0, x.type(), x.place());
+  const Tensor one = full_like_decomp<T>(x, 1, x.type(), x.place());
   Tensor pos_weight_tensor;
   if (pos_weight) {
     pos_weight_tensor = pos_weight.get();
@@ -1235,19 +1235,20 @@ Tensor sigmoid_cross_entropy_with_logits_decomp(
   }
   auto term1 = where<T>(x > zero, x, zero);
   auto term2 = x * label;
-  auto term3 = log<T>(1 + exp<T>(-abs<T>(x)));
+  auto term3 = log<T>(one + exp<T>(-abs<T>(x)));
   const Tensor tmp_out = term1 - term2 + term3 * pos_weight_tensor;
-  const Tensor ignore_index_tensor = full<T>(dims, ignore_index, label.type());
+  const Tensor ignore_index_tensor =
+      full_like_decomp<T>(x, ignore_index, label.type(), label.place());
   auto out = where<T>(label == ignore_index_tensor, zero, tmp_out);
   if (normalize) {
     // Follow the implementation in
     // paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc
-    const Tensor eps1 = full<T>(dims, 1e-6, x.type());
+    const Tensor eps1 = full_like_decomp<T>(x, 1e-6, x.type(), x.place());
     auto diff = label - ignore_index_tensor;
     const Tensor tmp_norm = sum<T>(where<T>(abs<T>(diff) > eps1, one, zero));
     // Follow the implementation in
     // paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc
-    const Tensor eps2 = full<T>(empty_shape, 1e-5, x.type());
+    const Tensor eps2 = full_scalar<T>(1e-5, x.type());
     auto norm = where<T>(tmp_norm > eps2, tmp_norm, eps2);
     out = out / norm;
   }
@@ -1387,8 +1388,8 @@ Tensor elu_decomp(const Tensor& x, const float alpha) {
 
   if (has_dynamic_shape(x_cast.shape())) {
     zero = backend::full_with_tensor<T>(shape<T>(x_cast), 0, x_cast.dtype());
-    tmp_res = full<T>(empty_shape, alpha, x_cast.dtype()) *
-              (exp<T>(x_cast) - full<T>(empty_shape, 1, x_cast.dtype()));
+    tmp_res = full_scalar<T>(alpha, x_cast.dtype()) *
+              (exp<T>(x_cast) - full_scalar<T>(1, x_cast.dtype()));
   } else {
     zero = full<T>(x_cast.shape(), 0, x_cast.type());
     tmp_res = alpha * (exp<T>(x_cast) - 1);
diff --git a/paddle/fluid/primitive/manual/manual_primitive.h b/paddle/fluid/primitive/manual/manual_primitive.h
index f2ec3ebce45b3..6587adf862a6e 100644
--- a/paddle/fluid/primitive/manual/manual_primitive.h
+++ b/paddle/fluid/primitive/manual/manual_primitive.h
@@ -30,6 +30,15 @@ Tensor full(const IntArray& shape,
   return backend::full<T>(shape, value, dtype, place);
 }
 
+// Creates a 0-D (scalar) tensor, i.e. shape [], filled with `value`.
+template <typename T>
+Tensor full_scalar(const Scalar& value,
+                   DataType dtype = DataType::FLOAT32,
+                   Place place = Place()) {
+  const std::vector<int64_t> scalar_shape;
+  return backend::full<T>(scalar_shape, value, dtype, place);
+}
+
 template <typename T>
 Tensor assign_out_(const Tensor& x, const Tensor& output) {
   return backend::assign_out_<T>(x, output);
diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h
index 00e464859e29e..551a67fc22a1b 100644
--- a/paddle/fluid/primitive/rule/vjp/details.h
+++ b/paddle/fluid/primitive/rule/vjp/details.h
@@ -154,41 +154,83 @@ void sum_grad(const Tensor& x,
   if (!x_grad) {
     return;
   }
-  std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
+
   int64_t axis_size = axis.size();
-  int64_t x_dim_size = x_dim.size();
+  int64_t x_dim_size = x.dims().size();
+  auto x_grad_tmp = Tensor();
   reduce_all = false;
   if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
     reduce_all = true;
   } else {
     reduce_all = false;
   }
-  auto x_grad_tmp = Tensor();
-  if (x_dim_size == 1) {
-    x_grad_tmp = expand<T>(out_grad, IntArray(x_dim));
-  } else {
-    if (!keepdim) {
-      auto axis_ = std::vector<int64_t>();
-      if (reduce_all) {
-        for (int64_t i = 0; i < x_dim_size; i++) {
-          axis_.push_back(i);
+  if (has_dynamic_shape(x.shape())) {
+    Tensor x_shape = shape<T>(x);
+    if (x_dim_size == 1) {
+      x_grad_tmp = backend::expand<T>(out_grad, x_shape);
+    } else {
+      if (!keepdim) {
+        auto axis_ = std::vector<int64_t>();
+        if (reduce_all) {
+          for (int64_t i = 0; i < x_dim_size; i++) {
+            axis_.push_back(i);
+          }
+        } else {
+          axis_ = axis.GetData();
+          for (int64_t i = 0; i < axis_size; i++) {
+            if (axis[i] < 0) {
+              axis_[i] = axis[i] + x_dim_size;
+            }
+          }
         }
-      } else {
-        axis_ = axis.GetData();
-        for (int64_t i = 0; i < axis_size; i++) {
-          if (axis[i] < 0) {
-            axis_[i] = axis[i] + x_dim_size;
+        Tensor out_grad_shape = shape<T>(out_grad);
+        size_t total_shape_size = out_grad.shape().size() + axis_.size();
+        std::vector<Tensor> result_shape;
+        size_t j = 0, k = 0;
+        Tensor ones = full<T>({1}, 1, x_shape.dtype());
+        for (size_t i = 0; i < total_shape_size; i++) {
+          if (j < axis_.size() && axis_[j] == int64_t(i)) {
+            result_shape.push_back(ones);
+            j++;
+          } else {
+            result_shape.push_back(slice<T>(
+                out_grad_shape, {0}, {int64_t(k)}, {int64_t(k) + 1}, {1}, {}));
+            k++;
           }
         }
+        auto out_grad_ = backend::reshape<T>(out_grad, concat<T>(result_shape));
+        x_grad_tmp = backend::expand<T>(out_grad_, x_shape);
+      } else {
+        x_grad_tmp = backend::expand<T>(out_grad, x_shape);
       }
-      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
-      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
-      x_grad_tmp = expand<T>(out_grad_, IntArray(x_dim));
-    } else {
+    }
+  } else {
+    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
+    if (x_dim_size == 1) {
       x_grad_tmp = expand<T>(out_grad, IntArray(x_dim));
+    } else {
+      if (!keepdim) {
+        auto axis_ = std::vector<int64_t>();
+        if (reduce_all) {
+          for (int64_t i = 0; i < x_dim_size; i++) {
+            axis_.push_back(i);
+          }
+        } else {
+          axis_ = axis.GetData();
+          for (int64_t i = 0; i < axis_size; i++) {
+            if (axis[i] < 0) {
+              axis_[i] = axis[i] + x_dim_size;
+            }
+          }
+        }
+        auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
+        auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
+        x_grad_tmp = expand<T>(out_grad_, IntArray(x_dim));
+      } else {
+        x_grad_tmp = expand<T>(out_grad, IntArray(x_dim));
+      }
     }
   }
-
   set_output<T>(x_grad_tmp, x_grad);
 }
 
@@ -899,7 +941,8 @@ template <typename T>
 void sqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
   if (x_grad) {
     // This calculation is important for resnet.
-    auto x_grad_tmp = (0.5 / out) * out_grad;
+    auto half = full_scalar<T>(0.5, out.dtype());
+    auto x_grad_tmp = (half / out) * out_grad;
     set_output<T>(x_grad_tmp, x_grad);
   }
 }
@@ -908,7 +951,8 @@ template <typename T>
 void rsqrt_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
   if (x_grad) {
     // This calculation is important for resnet.
-    auto x_grad_tmp = -0.5 * out * out * out * out_grad;
+    auto neg_half = full_scalar<T>(-0.5, out.dtype());
+    auto x_grad_tmp = neg_half * out * out * out * out_grad;
     set_output<T>(x_grad_tmp, x_grad);
   }
 }
@@ -929,7 +973,8 @@ void silu_grad(const Tensor& x,
       auto res = out_grad_cast * sigmoid<T>(x_cast) * (1.0 + x_cast - out_cast);
       set_output<T>(cast<T>(res, org_dtype), x_grad);
     } else {
-      auto res = out_grad * sigmoid<T>(x) * (1.0 + x - out);
+      auto one = full_scalar<T>(1.0, x.dtype());
+      auto res = out_grad * sigmoid<T>(x) * (one + x - out);
       set_output<T>(res, x_grad);
     }
   }
@@ -1483,13 +1528,20 @@ void slice_grad(const Tensor& input,
       paddings.push_back(offsets[i]);
       paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]);
     }
+    // A 0-D out_grad is viewed as shape [1] before padding; its value is the
+    // upstream gradient and must be kept, not replaced by a constant 1.
+    Tensor reshape_out_grad = out_grad;
+    if (out_grad.shape().size() == 0) {
+      reshape_out_grad = reshape<T>(out_grad, {1});
+    }
+
     if (decrease_size > 0 &&
         (decrease_size != static_cast<size_t>(in_dims.size()))) {
       auto out_tmp =
-          pad<T>(reshape<T>(out_grad, origin_out_shape), paddings, 0.0);
+          pad<T>(reshape<T>(reshape_out_grad, origin_out_shape), paddings, 0.0);
       set_output<T>(out_tmp, input_grad);
     } else {
-      auto out_tmp = pad<T>(out_grad, paddings, 0.0);
+      auto out_tmp = pad<T>(reshape_out_grad, paddings, 0.0);
       set_output<T>(out_tmp, input_grad);
     }
   }
@@ -1548,7 +1600,8 @@ void leaky_relu_grad(const Tensor& out,
 template <typename T>
 void sigmoid_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
   if (x_grad) {
-    set_output<T>(out_grad * (out * (1 - out)), x_grad);
+    auto one = full_scalar<T>(1.0, out.dtype());
+    set_output<T>(out_grad * (out * (one - out)), x_grad);
   }
 }
 
@@ -1772,11 +1825,13 @@ void prod_grad(const Tensor& x,
     } else {
       reduce_all = false;
     }
-    auto x_grad_tmp = Tensor();
-    auto out_tmp = Tensor();
+    auto out_grad_tmp = Tensor();
+    auto x_reshape = Tensor();
+    std::vector<int64_t> unchange_axis, change_axis, transpose_shape,
+        cumprod_shape;
+    std::vector<int> transpose_dim, origin_position;
     if (x_dim_size == 1) {
-      x_grad_tmp = out_grad.expand(IntArray(x_dim));
-      out_tmp = out.expand(IntArray(x_dim));
+      out_grad_tmp = out_grad.expand(IntArray(x_dim));
     } else {
       if (!keep_dim) {
         auto axis_ = std::vector<int64_t>();
@@ -1794,16 +1849,69 @@ void prod_grad(const Tensor& x,
         }
         auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
         auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
-        x_grad_tmp = out_grad_.expand(IntArray(x_dim));
-        auto out_ = reshape<T>(out, out_grad_shape);
-        out_tmp = out_.expand(IntArray(x_dim));
+        out_grad_tmp = out_grad_.expand(IntArray(x_dim));
       } else {
-        x_grad_tmp = out_grad.expand(IntArray(x_dim));
-        out_tmp = out.expand(IntArray(x_dim));
+        out_grad_tmp = out_grad.expand(IntArray(x_dim));
       }
     }
-    auto x_grad_res = x_grad_tmp * out_tmp * (1 / x);
-    set_output<T>(x_grad_res, x_grad);
+    auto axis_ = std::vector<int64_t>();
+    if (reduce_all) {
+      int64_t numel = 1;
+      for (int64_t i = 0; i < x_dim_size; i++) {
+        axis_.push_back(i);
+        numel *= x_dim[i];
+      }
+      cumprod_shape.push_back(numel);
+      x_reshape = reshape<T>(x, cumprod_shape);
+      auto left_cumprod = cumprod<T>(x_reshape, -1, true, false);
+      auto right_cumprod = cumprod<T>(x_reshape, -1, true, true);
+      auto x_grad_tmp = left_cumprod * right_cumprod;
+      auto x_grad_tmp2 = reshape<T>(x_grad_tmp, x.shape());
+      auto x_grad_res = x_grad_tmp2 * out_grad_tmp;
+      set_output<T>(x_grad_res, x_grad);
+    } else {
+      int64_t unchange_size = x_dim_size - axis_size;
+      int64_t unchange_index = 0;
+      for (int64_t i = 0; i < axis_size; i++) {
+        if (axis[i] < 0) {
+          axis_.push_back(axis[i] + x_dim_size);
+        } else {
+          axis_.push_back(axis[i]);
+        }
+      }
+      // origin_position[i]: slot of axis i in [kept..., reduced...] order.
+      for (int64_t i = 0; i < x_dim_size; i++) {
+        auto it = std::find(axis_.begin(), axis_.end(), i);
+        if (it != axis_.end()) {
+          int64_t index = it - axis_.begin();
+          origin_position.push_back(static_cast<int>(unchange_size + index));
+        } else {
+          unchange_axis.push_back(i);
+          origin_position.push_back(static_cast<int>(unchange_index++));
+        }
+      }
+      int64_t numel = 1;
+      for (int64_t i = 0; i < unchange_size; i++) {
+        transpose_shape.push_back(x_dim[unchange_axis[i]]);
+        cumprod_shape.push_back(x_dim[unchange_axis[i]]);
+        transpose_dim.push_back(static_cast<int>(unchange_axis[i]));
+      }
+      for (int64_t i = 0; i < axis_size; i++) {
+        transpose_shape.push_back(x_dim[axis_[i]]);
+        transpose_dim.push_back(static_cast<int>(axis_[i]));
+        numel *= x_dim[axis_[i]];
+      }
+      cumprod_shape.push_back(numel);
+      auto x_transpose = transpose<T>(x, transpose_dim);
+      x_reshape = reshape<T>(x_transpose, cumprod_shape);
+      auto left_cumprod = cumprod<T>(x_reshape, -1, true, false);
+      auto right_cumprod = cumprod<T>(x_reshape, -1, true, true);
+      auto x_grad_tmp = left_cumprod * right_cumprod;
+      auto x_grad_reshape = reshape<T>(x_grad_tmp, transpose_shape);
+      auto x_grad_tmp2 = transpose<T>(x_grad_reshape, origin_position);
+      auto x_grad_res = x_grad_tmp2 * out_grad_tmp;
+      set_output<T>(x_grad_res, x_grad);
+    }
   }
 }
 
diff --git a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc
index 623225bb8c09b..b6b3461f3aca0 100644
--- a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc
+++ b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc
@@ -23,8 +23,7 @@
 #include "paddle/fluid/primitive/utils/utils.h"
 #include "paddle/pir/include/core/operation.h"
 
-namespace paddle {
-namespace primitive {
+namespace paddle::primitive {
 
 std::vector<std::vector<paddle::Tensor>> add_n_vjp(
     const std::vector<paddle::Tensor>& x,
@@ -186,5 +185,4 @@ std::vector<std::vector<paddle::Tensor>> fused_attention_vjp(
   return vjp_res;
 }
 
-}  // namespace primitive
-}  // namespace paddle
+}  // namespace paddle::primitive
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 6deffc89271f9..a3086b7d7e34a 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -14,7 +14,6 @@ set(PYBIND_DEPS
     pass
     generate_pass
     pass_builder
-    parallel_executor
     compiled_program
     profiler
     layer
@@ -130,7 +129,6 @@ set(PYBIND_SRCS
     protobuf.cc
     exception.cc
     op_function_common.cc
-    parallel_executor.cc
     compiled_program.cc
     tensor.cc
     place.cc
@@ -310,18 +308,12 @@ if(WITH_PYTHON)
     list(REMOVE_ITEM GENERATOR_DEPS python)
   endif()
   target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS})
-  if(NOT WIN32)
-    add_executable(kernel_signature_generator kernel_signature_generator.cc)
-    target_link_libraries(kernel_signature_generator
-                          ${OP_FUNCTION_GENERATOR_DEPS})
-  endif()
 
   get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(eager_legacy_op_function_generator
                         ${os_dependency_modules})
   if(WITH_ROCM)
     target_link_libraries(eager_legacy_op_function_generator ${ROCM_HIPRTC_LIB})
-    target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB})
   endif()
 
   set(op_function_output_path ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/)
diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index 87895d6b4df31..f8c53a52e8d46 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -264,8 +264,8 @@ void BindAutoParallel(py::module *m) {
                    &ProcessMesh::dim_size))
           .def("empty", &ProcessMesh::empty)
           .def("contains", &ProcessMesh::contains)
-          .def(py::self == py::self)
-          .def(py::self != py::self)
+          .def(py::self == py::self)  // NOLINT
+          .def(py::self != py::self)  // NOLINT
           .def("__copy__",
                [](const ProcessMesh &self) { return ProcessMesh(self); })
           .def(
@@ -298,8 +298,8 @@ void BindAutoParallel(py::module *m) {
       .def_property_readonly("machine_id", &Device::machine_id)
       .def_property_readonly("type", &Device::type)
       .def_property("capability", &Device::capability, &Device::set_capability)
-      .def(py::self == py::self)
-      .def(py::self != py::self)
+      .def(py::self == py::self)  // NOLINT
+      .def(py::self != py::self)  // NOLINT
       .def("__str__", &Device::to_string);
 
   py::class_<LinkCapability>(*m, "LinkCapability")
@@ -317,8 +317,8 @@ void BindAutoParallel(py::module *m) {
       .def_property_readonly("target_id", &Link::target_id)
       .def_property_readonly("type", &Link::type)
       .def_property("capability", &Link::capability, &Link::set_capability)
-      .def(py::self == py::self)
-      .def(py::self != py::self)
+      .def(py::self == py::self)  // NOLINT
+      .def(py::self != py::self)  // NOLINT
       .def("__str__", &Link::to_string);
 
   py::class_<Machine>(*m, "Machine")
@@ -362,8 +362,8 @@ void BindAutoParallel(py::module *m) {
       .def("dim_size",
            static_cast<int64_t (DeviceMesh::*)(const std::string &) const>(
                &DeviceMesh::dim_size))
-      .def(py::self == py::self)
-      .def(py::self != py::self)
+      .def(py::self == py::self)  // NOLINT
+      .def(py::self != py::self)  // NOLINT
       .def("__copy__",
            [](const TensorDistAttr &self) { return TensorDistAttr(self); })
       .def(
@@ -435,8 +435,8 @@ void BindAutoParallel(py::module *m) {
           .def("is_partial", &phi::distributed::Placement::is_partial)
           .def("__hash__", &phi::distributed::Placement::hash)
           .def("__str__", &phi::distributed::Placement::to_string)
-          .def(py::self == py::self)
-          .def(py::self != py::self);
+          .def(py::self == py::self)   // NOLINT
+          .def(py::self != py::self);  // NOLINT
 
   auto Shard = py::class_<phi::distributed::Shard,
                           std::shared_ptr<phi::distributed::Shard>>(
@@ -464,8 +464,8 @@ void BindAutoParallel(py::module *m) {
                    .def("get_dim", &phi::distributed::Shard::get_dim)
                    .def("__hash__", &phi::distributed::Shard::hash)
                    .def("__str__", &phi::distributed::Shard::to_string)
-                   .def(py::self == py::self)
-                   .def(py::self != py::self);
+                   .def(py::self == py::self)   // NOLINT
+                   .def(py::self != py::self);  // NOLINT
 
   auto Replicate = py::class_<phi::distributed::Replicate,
                               std::shared_ptr<phi::distributed::Replicate>>(
@@ -487,8 +487,8 @@ void BindAutoParallel(py::module *m) {
                        .def(py::init<>())
                        .def("__hash__", &phi::distributed::Replicate::hash)
                        .def("__str__", &phi::distributed::Replicate::to_string)
-                       .def(py::self == py::self)
-                       .def(py::self != py::self);
+                       .def(py::self == py::self)   // NOLINT
+                       .def(py::self != py::self);  // NOLINT
 
   auto Partial = py::class_<phi::distributed::Partial,
                             std::shared_ptr<phi::distributed::Partial>>(
@@ -514,8 +514,8 @@ void BindAutoParallel(py::module *m) {
                           py::arg("reduce_type") = phi::ReduceType::kRedSum)
                      .def("__hash__", &phi::distributed::Partial::hash)
                      .def("__str__", &phi::distributed::Partial::to_string)
-                     .def(py::self == py::self)
-                     .def(py::self != py::self);
+                     .def(py::self == py::self)   // NOLINT
+                     .def(py::self != py::self);  // NOLINT
 
   g_placement_shard_pytype = reinterpret_cast<PyTypeObject *>(Shard.ptr());
   g_placement_replicated_pytype =
@@ -565,8 +565,8 @@ void BindAutoParallel(py::module *m) {
              return py::bytes(self.serialize_to_string());
            })
       .def("parse_from_string", &TensorDistAttr::parse_from_string)
-      .def(py::self == py::self)
-      .def(py::self != py::self)
+      .def(py::self == py::self)  // NOLINT
+      .def(py::self != py::self)  // NOLINT
       .def("__copy__",
            [](const TensorDistAttr &self) { return TensorDistAttr(self); })
       .def(
@@ -719,8 +719,8 @@ void BindAutoParallel(py::module *m) {
              return py::bytes(self.serialize_to_string());
            })
       .def("parse_from_string", &OperatorDistAttr::parse_from_string)
-      .def(py::self == py::self)
-      .def(py::self != py::self)
+      .def(py::self == py::self)  // NOLINT
+      .def(py::self != py::self)  // NOLINT
       .def("__copy__",
            [](const OperatorDistAttr &self) { return OperatorDistAttr(self); })
       .def(
diff --git a/paddle/fluid/pybind/control_flow_api.cc b/paddle/fluid/pybind/control_flow_api.cc
index 036c4d9fd8200..61be0eb61fb3e 100644
--- a/paddle/fluid/pybind/control_flow_api.cc
+++ b/paddle/fluid/pybind/control_flow_api.cc
@@ -52,7 +52,6 @@ using pir::Builder;
 using pir::CombineOp;
 using pir::Operation;
 using pir::Program;
-using pir::Region;
 using pir::StackCreateOp;
 using pir::TuplePopOp;
 using pir::TuplePushOp;
@@ -271,8 +270,7 @@ void BuildPipeForBlock(Block* block) {
 
 }  // namespace
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 PyIfOp::PyIfOp(IfOp if_op) : IfOp(if_op) {
   PADDLE_ENFORCE_NOT_NULL(
       if_op,
@@ -413,5 +411,4 @@ void BindControlFlowApi(py::module* m) {
   BindTuplePopOp(m);
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc
index bda9e8e653ef0..0c2f883de904d 100644
--- a/paddle/fluid/pybind/data_set_py.cc
+++ b/paddle/fluid/pybind/data_set_py.cc
@@ -39,8 +39,7 @@ limitations under the License. */
 
 namespace py = pybind11;
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 class IterableDatasetWrapper {
  public:
@@ -400,5 +399,4 @@ void BindDataset(py::module *m) {
       .def("_next", &IterableDatasetWrapper::Next);
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/dist_api.cc b/paddle/fluid/pybind/dist_api.cc
index 31a32c3e27a14..4907f52979277 100644
--- a/paddle/fluid/pybind/dist_api.cc
+++ b/paddle/fluid/pybind/dist_api.cc
@@ -26,8 +26,7 @@
 
 namespace py = pybind11;
 
-namespace pybind11 {
-namespace detail {
+namespace pybind11::detail {
 template <typename Key,
           typename Value,
           typename Hash,
@@ -37,15 +36,13 @@ struct type_caster<paddle::flat_hash_map<Key, Value, Hash, Equal, Alloc>>
     : map_caster<paddle::flat_hash_map<Key, Value, Hash, Equal, Alloc>,
                  Key,
                  Value> {};
-}  // namespace detail
-}  // namespace pybind11
+}  // namespace pybind11::detail
 
 using paddle::dialect::OperationDistAttribute;
 using paddle::dialect::ProcessMeshAttribute;
 using paddle::dialect::TensorDistAttribute;
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 void BindOperationDistAttribute(py::module *m) {
   py::class_<OperationDistAttribute, pir::Attribute> dist_attr(
@@ -150,5 +147,4 @@ void BindDistApi(pybind11::module *module) {
   BindOpsFunction(&ops_modules);
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 00b6ba994233f..4fbb8c3d48e2d 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -1416,7 +1416,7 @@ void BindEager(pybind11::module* module) {
   Py_INCREF(&PyBaseObject_Type);
   type->tp_base = reinterpret_cast<PyTypeObject*>(&PyBaseObject_Type);
   type->tp_flags |=
-      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;  // NOLINT
 #if PY_VERSION_HEX >= 0x03050000
   type->tp_as_async = &heap_type->as_async;
 #endif
@@ -1464,7 +1464,7 @@ void BindEagerStringTensor(pybind11::module* module) {
   Py_INCREF(&PyBaseObject_Type);
   type->tp_base = reinterpret_cast<PyTypeObject*>(&PyBaseObject_Type);
   type->tp_flags |=
-      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;  // NOLINT
 #if PY_VERSION_HEX >= 0x03050000
   type->tp_as_async = &heap_type->as_async;
 #endif
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 11298fda6a300..6b3c15b55666e 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -72,8 +72,7 @@ typedef SSIZE_T ssize_t;
 COMMON_DECLARE_bool(set_to_1d);
 COMMON_DECLARE_bool(use_stride_kernel);
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 extern void InitTensorWithNumpyValue(TensorObject* self,
                                      const pybind11::object& array,
@@ -3518,5 +3517,4 @@ PyMethodDef string_tensor_variable_methods[] = {  // NOLINT
      nullptr},
     // TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor.
     {nullptr, nullptr, 0, nullptr}};
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc
index 0d9aff8c7ef32..a51fbd72947f3 100644
--- a/paddle/fluid/pybind/eager_py_layer.cc
+++ b/paddle/fluid/pybind/eager_py_layer.cc
@@ -778,7 +778,7 @@ void BindEagerPyLayer(PyObject* module) {
   Py_INCREF(&PyBaseObject_Type);
   type->tp_base = reinterpret_cast<PyTypeObject*>(&PyBaseObject_Type);
   type->tp_flags |=
-      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;  // NOLINT
 #if PY_VERSION_HEX >= 0x03050000
   type->tp_as_async = &heap_type->as_async;
 #endif
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 6044d2aa567e2..2fcbe08afdded 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -50,8 +50,7 @@ limitations under the License. */
 
 COMMON_DECLARE_bool(check_nan_inf);
 COMMON_DECLARE_int32(check_nan_inf_level);
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 extern PyTypeObject* p_tensor_type;
 extern PyTypeObject* p_string_tensor_type;
@@ -2824,5 +2823,4 @@ void BindEagerUtils(PyObject* module) {
   }
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc
index 7061b844987fa..1a2727504e197 100644
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/common/exception.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 /* Paddle Exception mapping rules:
  *   - InvalidArgumentError -> ValueError
@@ -139,5 +138,4 @@ void ThrowExceptionToPython(std::exception_ptr p) {
     PyErr_SetString(PyExc_OSError, e.what());
   }
 }
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc
index 41a98f7316766..a1ef869c087cb 100644
--- a/paddle/fluid/pybind/generator_py.cc
+++ b/paddle/fluid/pybind/generator_py.cc
@@ -29,8 +29,7 @@ limitations under the License. */
 
 namespace py = pybind11;
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 void BindGenerator(py::module* m_ptr) {
   auto& m = *m_ptr;
   py::class_<phi::Generator::GeneratorState,
@@ -93,5 +92,4 @@ void BindGenerator(py::module* m_ptr) {
   m.def("set_random_seed_generator", &phi::SetRandomSeedGenerator);
   m.def("get_random_seed_generator", &phi::GetRandomSeedGenerator);
 }
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 68c5ae15f299a..ba37696f64d3e 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -45,8 +45,7 @@ PD_DECLARE_int32(rpc_get_thread_num);
 PD_DECLARE_int32(rpc_prefetch_thread_num);
 #endif
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 namespace py = pybind11;
 
@@ -306,5 +305,4 @@ static void RegisterGlobalVarGetterSetter() {
   }
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc
index 4b2117306a2d5..6b44a7311d72a 100644
--- a/paddle/fluid/pybind/gloo_context_py.cc
+++ b/paddle/fluid/pybind/gloo_context_py.cc
@@ -35,8 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/platform/gloo_context.h"
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 namespace py = ::pybind11;
 
@@ -109,5 +108,4 @@ void BindGlooContext(py::module *m) {
 #endif
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/graph.cc b/paddle/fluid/pybind/graph.cc
index 6acba237ba928..4e5329bbf2bfc 100644
--- a/paddle/fluid/pybind/graph.cc
+++ b/paddle/fluid/pybind/graph.cc
@@ -47,8 +47,7 @@ using paddle::framework::ir::NodeComp;
 using paddle::framework::ir::TopologySortOperations;
 using pybind11::return_value_policy;
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 void BindGraph(py::module *m) {
   m->def("graph_safe_remove_nodes",
          [](Graph *graph, const std::unordered_set<const Node *> &nodes) {
@@ -408,5 +407,4 @@ void BindPass(py::module *m) {
          });
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 0b3d79b6e4ea4..f12828ba6ef80 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -52,8 +52,7 @@
 
 namespace py = pybind11;  // NOLINT
 
-namespace pybind11 {
-namespace detail {
+namespace pybind11::detail {
 
 // Note: use same enum number of float16 in numpy.
 // import numpy as np
@@ -79,11 +78,9 @@ struct npy_format_descriptor<phi::dtype::float16> {
   static constexpr auto name = _("float16");
 };
 
-}  // namespace detail
-}  // namespace pybind11
+}  // namespace pybind11::detail
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 using paddle::AnalysisPredictor;
 using paddle::NativeConfig;
 using paddle::NativePaddlePredictor;
@@ -1345,5 +1342,4 @@ void BindInternalUtils(py::module *m) {
                   });
 }
 }  // namespace
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc
deleted file mode 100644
index 23892fabe1c24..0000000000000
--- a/paddle/fluid/pybind/kernel_signature_generator.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <iostream>
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/pybind/pybind.h"  // NOLINT
-#include "paddle/phi/core/compat/op_utils.h"
-#include "paddle/phi/core/kernel_factory.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/declarations.h"
-
-// print names of kernel function params with json format:
-// {
-// "norm":{
-//   "inputs":[
-//     "X"
-//   ],
-//   "attrs":[
-//     "axis",
-//     "epsilon",
-//     "is_test"
-//   ],
-//   "outputs":[
-//     "Norm",
-//     "Out"
-//   ]
-// },
-// ...
-// }
-int main(int argc, char **argv) {  // NOLINT
-  paddle::framework::InitDefaultKernelSignatureMap();
-  auto &kernel_signature_map = phi::DefaultKernelSignatureMap::Instance();
-  auto &kernel_factory = phi::KernelFactory::Instance();
-  std::string kernel_signature_map_str{"{"};
-  for (const auto &op_kernel_pair : kernel_factory.kernels()) {
-    std::string op_name = op_kernel_pair.first;
-    const paddle::flat_hash_map<std::string, std::string> &kernel_name_map =
-        phi::OpUtilsMap::Instance().fluid_op_to_phi_kernel();
-    for (auto &it : kernel_name_map) {
-      if (it.second == op_name) {
-        op_name = it.first;
-        break;
-      }
-    }
-    if (kernel_signature_map.Has(op_name)) {
-      kernel_signature_map_str.append("\"")
-          .append(op_kernel_pair.first)
-          .append("\":{");
-      const auto &args = kernel_signature_map.Get(op_name);
-
-      kernel_signature_map_str += "\"inputs\":[";
-      auto inputs_ = args.input_names;
-      for (size_t i = 0; i < inputs_.size(); i++) {
-        kernel_signature_map_str.append("\"").append(inputs_[i]).append("\",");
-      }
-      if (!inputs_.empty()) kernel_signature_map_str.pop_back();
-
-      kernel_signature_map_str += "],\"attrs\":[";
-      auto attrs_ = args.attr_names;
-      for (size_t i = 0; i < attrs_.size(); i++) {
-        kernel_signature_map_str.append("\"").append(attrs_[i]).append("\",");
-      }
-      if (!attrs_.empty()) kernel_signature_map_str.pop_back();
-      kernel_signature_map_str += "],\"outputs\":[";
-      auto outputs_ = args.output_names;
-      for (size_t i = 0; i < outputs_.size(); i++) {
-        kernel_signature_map_str.append("\"").append(outputs_[i]).append("\",");
-      }
-
-      if (!outputs_.empty()) kernel_signature_map_str.pop_back();
-      kernel_signature_map_str += "]},";
-    }
-  }
-  kernel_signature_map_str.pop_back();
-  kernel_signature_map_str += "}\n";
-  std::cout << kernel_signature_map_str;
-  return 0;
-}
diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h
index 872be599d9a76..f41950db85e6d 100644
--- a/paddle/fluid/pybind/manual_static_op_function.h
+++ b/paddle/fluid/pybind/manual_static_op_function.h
@@ -81,7 +81,7 @@ static PyObject *static_api_set_parameter(PyObject *self,
   }
 }
 
-static PyObject *static_api_updata_parameter(PyObject *self,
+static PyObject *static_api_update_parameter(PyObject *self,
                                              PyObject *args,
                                              PyObject *kwargs) {
   try {
@@ -98,7 +98,7 @@ static PyObject *static_api_updata_parameter(PyObject *self,
     // Call ir static api
     CallStackRecorder callstack_recoder("uodata_parameter");
     callstack_recoder.Record();
-    paddle::dialect::updata_parameter(parameter, name);
+    paddle::dialect::update_parameter(parameter, name);
     callstack_recoder.AttachToOps();
     Py_RETURN_NONE;
   } catch (...) {
@@ -975,10 +975,10 @@ static PyMethodDef ManualOpsAPI[] = {
      (PyCFunction)(void (*)(void))static_api_set_parameter,
      METH_VARARGS | METH_KEYWORDS,
      "C++ interface function for set_parameter."},
-    {"updata_parameter",
-     (PyCFunction)(void (*)(void))static_api_updata_parameter,
+    {"update_parameter",
+     (PyCFunction)(void (*)(void))static_api_update_parameter,
      METH_VARARGS | METH_KEYWORDS,
-     "C++ interface function for updata_parameter."},
+     "C++ interface function for update_parameter."},
     {"set_persistable_value",
      (PyCFunction)(void (*)(void))static_api_set_persistable_value,
      METH_VARARGS | METH_KEYWORDS,
diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc
index d2478e592354f..2b57b27fb45f5 100644
--- a/paddle/fluid/pybind/op_function_common.cc
+++ b/paddle/fluid/pybind/op_function_common.cc
@@ -38,8 +38,7 @@
 #include "paddle/pir/include/core/op_result.h"
 #include "paddle/pir/include/core/value.h"
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 class OpAttrTypeMap {
  public:
@@ -1147,5 +1146,4 @@ ssize_t GetIdxFromCoreOpsInfoMap(
   return -1;
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc
deleted file mode 100644
index 7f6b054564bc6..0000000000000
--- a/paddle/fluid/pybind/parallel_executor.cc
+++ /dev/null
@@ -1,1178 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <Python.h>
-// Avoid a problem with copysign defined in pyconfig.h on Windows.
-#ifdef copysign
-#undef copysign
-#endif
-
-#include <algorithm>
-#include <cctype>
-#include <cstdlib>
-#include <iterator>
-#include <map>
-#include <memory>
-#include <mutex>  // NOLINT // for call_once
-#include <string>
-#include <tuple>
-#include <type_traits>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/custom_operator.h"
-#include "paddle/fluid/framework/data_layout.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/executor_cache.h"
-#include "paddle/fluid/framework/executor_gc_helper.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h"
-#include "paddle/fluid/framework/ir/cost_model.h"
-#include "paddle/fluid/framework/ir/generate_pass.h"
-#include "paddle/fluid/framework/ir/pass_builder.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/new_executor/executor_statistics.h"
-#include "paddle/fluid/framework/new_executor/standalone_executor.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/framework/prune.h"
-#include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/selected_rows_utils.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/trainer.h"
-#include "paddle/fluid/framework/type_defs.h"
-#include "paddle/fluid/framework/version.h"
-#include "paddle/fluid/imperative/amp_auto_cast.h"
-#include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h"
-#endif
-#include "paddle/fluid/memory/allocation/mmap_allocator.h"
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/common_infer_shape_functions.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/monitor.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/platform/profiler/event_python.h"
-#include "paddle/fluid/platform/profiler/event_tracing.h"
-#include "paddle/fluid/platform/profiler/profiler.h"
-#include "paddle/fluid/pybind/bind_cost_model.h"
-#include "paddle/fluid/pybind/bind_fleet_executor.h"
-#include "paddle/fluid/pybind/box_helper_py.h"
-#include "paddle/fluid/pybind/communication.h"
-#include "paddle/fluid/pybind/compatible.h"
-#include "paddle/fluid/pybind/const_value.h"
-#include "paddle/fluid/pybind/cuda_streams_py.h"
-#include "paddle/fluid/pybind/data_set_py.h"
-#include "paddle/fluid/pybind/distributed_py.h"
-#include "paddle/fluid/pybind/eager.h"
-#include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/fleet_wrapper_py.h"
-#include "paddle/fluid/pybind/generator_py.h"
-#include "paddle/fluid/pybind/global_value_getter_setter.h"
-#include "paddle/fluid/pybind/gloo_context_py.h"
-#include "paddle/fluid/pybind/gloo_wrapper_py.h"
-#include "paddle/fluid/pybind/graph.h"
-#include "paddle/fluid/pybind/heter_wrapper_py.h"
-#include "paddle/fluid/pybind/imperative.h"
-#include "paddle/fluid/pybind/inference_api.h"
-#include "paddle/fluid/pybind/io.h"
-#include "paddle/fluid/pybind/metrics_py.h"
-#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
-#include "paddle/fluid/pybind/pybind_variant_caster.h"
-#include "paddle/phi/backends/cpu/cpu_info.h"
-#include "paddle/phi/backends/device_manager.h"
-#include "paddle/phi/core/compat/convert_utils.h"
-#include "paddle/phi/core/lod_utils.h"
-#include "paddle/utils/none.h"
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/pybind/nccl_wrapper_py.h"
-#endif
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/pybind/protobuf.h"
-#include "paddle/fluid/pybind/pybind.h"  // NOLINT
-#include "paddle/fluid/pybind/reader_py.h"
-#include "paddle/fluid/pybind/tensor_py.h"
-#include "paddle/utils/string/to_string.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#endif
-#ifndef PADDLE_WITH_HIP
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#endif
-
-#ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/device/xpu/xpu_info.h"
-#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
-#endif
-
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "paddle/phi/capi/capi.h"
-#endif
-
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
-
-#ifdef PADDLE_WITH_IPU
-#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
-#include "paddle/fluid/platform/device/ipu/ipu_info.h"
-#endif
-
-#ifdef PADDLE_WITH_CRYPTO
-#include "paddle/fluid/pybind/crypto.h"
-#endif
-
-#if defined PADDLE_WITH_PSCORE
-#include "paddle/fluid/pybind/fleet_py.h"
-#endif
-
-#ifdef PADDLE_WITH_CINN
-#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
-#endif
-
-#include "paddle/common/flags.h"
-#include "paddle/fluid/eager/api/utils/global_utils.h"
-#include "paddle/fluid/imperative/layout_autotune.h"
-#include "paddle/fluid/pybind/eager_utils.h"
-#include "paddle/fluid/pybind/parallel_executor.h"
-#include "paddle/phi/api/ext/op_meta_info.h"
-#include "paddle/phi/kernels/autotune/cache.h"
-#include "paddle/phi/kernels/autotune/switch_autotune.h"
-#include "pybind11/stl.h"
-
-COMMON_DECLARE_bool(use_mkldnn);
-
-// disable auto conversion to list in Python
-PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
-PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
-PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
-PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
-
-namespace paddle {
-namespace pybind {
-using namespace paddle::framework;                // NOLINT
-void BindParallelExecutor(pybind11::module &m) {  // NOLINT
-  // -- python binds for parallel executor.
-  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
-
-  py::enum_<paddle::platform::DeviceType>(m, "DeviceType", py::arithmetic())
-      .value("CPU", paddle::platform::DeviceType::CPU)
-      .value("CUDA", paddle::platform::DeviceType::CUDA)
-      .value("XPU", paddle::platform::DeviceType::XPU);
-
-  exec_strategy.def(py::init())
-      .def_property(
-          "num_threads",
-          [](const ExecutionStrategy &self) { return self.num_threads_; },
-          [](ExecutionStrategy &self, size_t num_threads) {
-            self.num_threads_ = num_threads;
-          })
-      .def_property(
-          "_use_device",
-          [](const ExecutionStrategy &self) { return self.use_device_; },
-          [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) {
-            self.use_device_ = use_device;
-          })  // NOTE(liuyuhui): Doesn't add doc for 'use_device', because
-              // use_device isn‘t exposed to users.
-      .def_property(
-          "allow_op_delay",
-          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
-          [](ExecutionStrategy &self, bool allow_op_delay) {
-            self.allow_op_delay_ = allow_op_delay;
-          })
-      .def_property(
-          "num_iteration_per_drop_scope",
-          [](const ExecutionStrategy &self) {
-            return self.num_iteration_per_drop_scope_;
-          },
-          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
-            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          })
-      .def_property(
-          "num_iteration_per_run",
-          [](const ExecutionStrategy &self) {
-            return self.num_iteration_per_run_;
-          },
-          [](ExecutionStrategy &self, size_t num_iteration_per_run) {
-            self.num_iteration_per_run_ = num_iteration_per_run;
-          })
-      .def_property(
-          "use_thread_barrier",
-          [](const ExecutionStrategy &self) { return self.thread_barrier_; },
-          [](ExecutionStrategy &self, bool use_thread_barrier) {
-            self.thread_barrier_ = use_thread_barrier;
-          })
-      .def_property(
-          "_dry_run",
-          [](const ExecutionStrategy &self) { return self.dry_run_; },
-          [](ExecutionStrategy &self, bool dry_run) {
-            self.dry_run_ = dry_run;
-          });
-
-  exec_strategy.def_property(
-      "use_experimental_executor",
-      [](const ExecutionStrategy &self) {
-        return self.type_ == ExecutionStrategy::kExperimental;
-      },
-      [](ExecutionStrategy &self, bool experimental) {
-        self.type_ = experimental ? ExecutionStrategy::kExperimental
-                                  : ExecutionStrategy::kDefault;
-      });
-
-  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
-    BuildStrategy allows the user to more preciously control how to
-    build the SSA Graph in ParallelExecutor by setting the property.
-
-    Returns:
-        BuildStrategy: An BuildStrategy object.
-
-    Examples:
-        .. code-block:: python
-
-            >>> import paddle
-            >>> import paddle.static as static
-
-            >>> paddle.enable_static()
-
-            >>> data = static.data(name="x", shape=[None, 1], dtype="float32")
-            >>> hidden = static.nn.fc(data, size=10)
-            >>> loss = paddle.mean(hidden)
-            >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-            >>> build_strategy = static.BuildStrategy()
-            >>> build_strategy.enable_inplace = True
-            >>> build_strategy.memory_optimize = True
-            >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
-            >>> program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy)
-)DOC");
-
-  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
-      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
-      .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce)
-      .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce);
-  py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
-                                                  "GradientScaleStrategy")
-      .value("CoeffNumDevice",
-             BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
-      .value("One", BuildStrategy::GradientScaleStrategy::kOne)
-      .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
-
-  build_strategy.def(py::init())
-      .def("_clear_finalized", &BuildStrategy::ClearFinalized)
-      .def_property(
-          "reduce_strategy",
-          [](const BuildStrategy &self) { return self.reduce_; },
-          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.reduce_ = strategy;
-          },
-          R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce
-                strategies in ParallelExecutor, AllReduce and Reduce. If you want
-                that all the parameters' optimization are done on all devices independently,
-                you should choose AllReduce; otherwise, if you choose Reduce, all the parameters'
-                optimization will be evenly distributed to different devices, and then
-                broadcast the optimized parameter to other devices.
-                Default is 'AllReduce'.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
-          )DOC")
-      .def_property(
-          "gradient_scale_strategy",
-          [](const BuildStrategy &self) { return self.gradient_scale_; },
-          [](BuildStrategy &self,
-             BuildStrategy::GradientScaleStrategy strategy) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.gradient_scale_ = strategy;
-          },
-          R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three
-                ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice,
-                One and Customized. By default, ParallelExecutor sets the :math:`loss@grad`
-                according to the number of devices. If you want to customize :math:`loss@grad`,
-                you can choose Customized. Default is 'CoeffNumDevice'.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import numpy
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> use_cuda = paddle.device.is_compiled_with_cuda
-                        >>> place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-                        >>> exe = static.Executor(place)
-
-                        >>> data = static.data(name='X', shape=[None, 1], dtype='float32')
-                        >>> hidden = static.nn.fc(data, size=10)
-                        >>> loss = paddle.mean(hidden)
-                        >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
-
-                        >>> exe.run(static.default_startup_program())
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.gradient_scale_strategy = \
-                        ...             static.BuildStrategy.GradientScaleStrategy.Customized
-                        >>> compiled_prog = static.CompiledProgram(
-                        ...             static.default_main_program(),
-                        ...             build_strategy=build_strategy,
-                        >>> )
-
-                        >>> x = numpy.random.random(size=(10, 1)).astype('float32')
-                        >>> loss_grad = numpy.ones((1)).astype("float32") * 0.01
-                        >>> loss_grad_name = loss.name+"@GRAD"
-                        >>> loss_data = exe.run(compiled_prog,
-                        ...                         feed={"X": x, loss_grad_name : loss_grad},
-                        ...                         fetch_list=[loss.name, loss_grad_name])
-          )DOC")
-      .def_property(
-          "debug_graphviz_path",
-          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
-          [](BuildStrategy &self, const std::string &path) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.debug_graphviz_path_ = path;
-          },
-          R"DOC((str, optional): debug_graphviz_path indicates the path that
-                writing the SSA Graph to file in the form of graphviz.
-                It is useful for debugging. Default is empty string, that is, ""
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.debug_graphviz_path = "./graph"
-          )DOC")
-      .def_property(
-          "enable_sequential_execution",
-          [](const BuildStrategy &self) {
-            return self.enable_sequential_execution_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.enable_sequential_execution_ = b;
-          },
-          R"DOC((bool, optional): If set True, the execution order of ops would
-                be the same as what is in the program. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.enable_sequential_execution = True
-          )DOC")
-      .def_property(
-          "remove_unnecessary_lock",
-          [](const BuildStrategy &self) {
-            return self.remove_unnecessary_lock_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.remove_unnecessary_lock_ = b;
-          },
-          R"DOC((bool, optional): If set True, some locks in GPU ops would be
-                released and ParallelExecutor would run faster. Default is True.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.remove_unnecessary_lock = True
-          )DOC")
-      .def_property(
-          "num_trainers",
-          [](const BuildStrategy &self) { return self.num_trainers_; },
-          [](BuildStrategy &self, int num_trainers) {
-#ifdef WIN32
-            PADDLE_THROW(platform::errors::Unavailable(
-                "Distribution mode is not supported on Windows platform."));
-#endif
-            self.num_trainers_ = num_trainers;
-          })
-      .def_property(
-          "trainers_endpoints",
-          [](const BuildStrategy &self) { return self.trainers_endpoints_; },
-          [](BuildStrategy &self,
-             const std::vector<std::string> &trainers_endpoints) {
-            self.trainers_endpoints_ = trainers_endpoints;
-          })
-      .def_property(
-          "trainer_id",
-          [](const BuildStrategy &self) { return self.trainer_id_; },
-          [](BuildStrategy &self, int trainer_id) {
-            self.trainer_id_ = trainer_id;
-          })
-      .def_property(
-          "nccl_comm_num",
-          [](const BuildStrategy &self) { return self.nccl_comm_num_; },
-          [](BuildStrategy &self, int nccl_comm_num) {
-            self.nccl_comm_num_ = nccl_comm_num;
-          })
-      .def_property(
-          "bkcl_comm_num",
-          [](const BuildStrategy &self) { return self.bkcl_comm_num_; },
-          [](BuildStrategy &self, int bkcl_comm_num) {
-            self.bkcl_comm_num_ = bkcl_comm_num;
-          })
-      .def_property(
-          "use_hierarchical_allreduce",
-          [](const BuildStrategy &self) {
-            return self.use_hierarchical_allreduce_;
-          },
-          [](BuildStrategy &self, bool use) {
-            self.use_hierarchical_allreduce_ = use;
-          })
-      .def_property(
-          "hierarchical_allreduce_inter_nranks",
-          [](const BuildStrategy &self) {
-            return self.hierarchical_allreduce_inter_nranks_;
-          },
-          [](BuildStrategy &self, int nranks) {
-            self.hierarchical_allreduce_inter_nranks_ = nranks;
-          })
-      .def_property(
-          "build_cinn_pass",
-          [](const BuildStrategy &self) { return self.build_cinn_pass_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, "
-                                  "cannot be configured again."));
-            self.build_cinn_pass_ = b;
-          },
-          R"DOC((bool, optional): build_cinn_pass indicates whether
-                      to lowering some operators in graph into cinn ops
-                      to execute, which will speed up the process of execution.
-                      Default False.
-
-                      Examples:
-                            .. code-block:: python
-
-                                >>> import paddle
-                                >>> import paddle.static as static
-                                >>> paddle.enable_static()
-                                >>> build_strategy = static.BuildStrategy()
-                                >>> build_strategy.build_cinn_pass = True
-          )DOC")
-      .def_property(
-          "fuse_elewise_add_act_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_elewise_add_act_ops_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_elewise_add_act_ops_ = b;
-          },
-          R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether
-                to fuse elementwise_add_op and activation_op,
-                it may make the execution faster. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fuse_elewise_add_act_ops = True
-          )DOC")
-      .def_property(
-          "fuse_gemm_epilogue",
-          [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_gemm_epilogue_ = b;
-          },
-          R"DOC((bool, optional): fuse_gemm_epilogue indicate whether
-                to fuse matmul_op, elemenewist_add_op and activation_op,
-                it may make the execution faster. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fuse_gemm_epilogue = True
-          )DOC")
-      .def_property(
-          "fuse_dot_product_attention",
-          [](const BuildStrategy &self) {
-            return self.fuse_dot_product_attention_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finlaized, cannot be "
-                                  "configured again."));
-            self.fuse_dot_product_attention_ = b;
-          },
-          R"DOC((bool, optional): fuse_dot_product_attention indicate whether
-                to fuse dot product attention,
-                it would make the execution faster. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle
-                        import paddle.static as static
-
-                        paddle.enable_static()
-
-                        build_strategy = static.BuildStrategy()
-                        build_strategy.fuse_dot_product_attention = True
-                     )DOC")
-      .def_property(
-          "fuse_adamw",
-          [](const BuildStrategy &self) { return self.fuse_adamw_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_adamw_ = b;
-          },
-          R"DOC((bool, optional): fuse_adamw indicate whether
-                to fuse all adamw optimizers with multi_tensor_adam,
-                it may make the execution faster. Default is False.
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-                        >>> paddle.enable_static()
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fuse_adamw = True
-          )DOC")
-      .def_property(
-          "fused_attention",
-          [](const BuildStrategy &self) { return self.fused_attention_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fused_attention_ = b;
-          },
-          R"DOC((bool, optional): fused_attention indicate whether
-                to fuse the whole multi head attention part with one op,
-                it may make the execution faster. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fused_attention = True
-          )DOC")
-      .def_property(
-          "fused_feedforward",
-          [](const BuildStrategy &self) { return self.fused_feedforward_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fused_feedforward_ = b;
-          },
-          R"DOC((bool, optional): fused_feedforward indicate whether
-                to fuse the whole feed_forward part with one op,
-                it may make the execution faster. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fused_feedforward = True
-          )DOC")
-      .def_property(
-          "sequential_run",
-          [](const BuildStrategy &self) { return self.sequential_run_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.sequential_run_ = b;
-          },
-          R"DOC((bool, optional): sequential_run is used to let the `StandaloneExecutor` run ops by the
-          order of `ProgramDesc`. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.sequential_run = True
-          )DOC")
-      .def_property(
-          "fuse_resunit",
-          [](const BuildStrategy &self) { return self.fuse_resunit_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_resunit_ = b;
-#ifndef PADDLE_WITH_CUDNN_FRONTEND
-            if (self.fuse_resunit_) {
-              PADDLE_THROW(platform::errors::PreconditionNotMet(
-                  "Paddle is not built with CUDNN Frontend support."));
-            }
-#endif
-          },
-          R"DOC((bool, optional): fuse_resunit Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        import paddle
-                        import paddle.static as static
-
-                        paddle.enable_static()
-
-                        build_strategy = static.BuildStrategy()
-                        build_strategy.fuse_resunit = True
-                     )DOC")
-      .def_property(
-          "fuse_bn_act_ops",
-          [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_bn_act_ops_ = b;
-          },
-          R"DOC((bool, optional): fuse_bn_act_ops indicate whether
-                to fuse batch_norm and activation_op,
-                it may make the execution faster. Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fuse_bn_act_ops = True
-          )DOC")
-      .def_property(
-          "fuse_bn_add_act_ops",
-          [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_bn_add_act_ops_ = b;
-          },
-          R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether
-                to fuse batch_norm, elementwise_add and activation_op,
-                it may make the execution faster. Default is True
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fuse_bn_add_act_ops = True
-          )DOC")
-      .def_property(
-          "enable_auto_fusion",
-          [](const BuildStrategy &self) { return self.enable_auto_fusion_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.enable_auto_fusion_ = b;
-          },
-          R"DOC((bool, optional): Whether to enable fusing subgraph to a
-                fusion_group. Now we only support fusing subgraph that composed
-                of elementwise-like operators, such as elementwise_add/mul
-                without broadcast and activations.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.enable_auto_fusion = True
-          )DOC")
-      .def_property(
-          "fuse_relu_depthwise_conv",
-          [](const BuildStrategy &self) {
-            return self.fuse_relu_depthwise_conv_;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.fuse_relu_depthwise_conv_ = b;
-          },
-          R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether
-                to fuse relu and depthwise_conv2d,
-                it will save GPU memory and may make the execution faster.
-                This options is only available in GPU devices.
-                Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.fuse_relu_depthwise_conv = True
-          )DOC")
-      .def_property(
-          "fuse_broadcast_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_broadcast_ops_ == true ||
-                   self.fuse_broadcast_ops_ == paddle::none;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, "
-                                  "cannot be configured again."));
-            self.fuse_broadcast_ops_ = b;
-          },
-          R"DOC((bool, optional): fuse_broadcast_op indicates whether
-                      to fuse the broadcast ops. Note that, in Reduce mode,
-                      fusing broadcast ops may make the program faster. Because
-                      fusing broadcast OP equals delaying the execution of all
-                      broadcast Ops, in this case, all nccl streams are used only
-                      for NCCLReduce operations for a period of time. Default False.
-
-                      Examples:
-                            .. code-block:: python
-
-                                >>> import paddle
-                                >>> import paddle.static as static
-                                >>> paddle.enable_static()
-
-                                >>> build_strategy = static.BuildStrategy()
-                                >>> build_strategy.fuse_broadcast_ops = True
-          )DOC")
-      .def_property(
-          "fuse_all_optimizer_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_all_optimizer_ops_ == true ||
-                   self.fuse_all_optimizer_ops_ == paddle::none;
-          },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, "
-                                  "cannot be configured again."));
-            self.fuse_all_optimizer_ops_ = b;
-          })
-      .def_property(
-          "sync_batch_norm",
-          [](const BuildStrategy &self) { return self.sync_batch_norm_; },
-          [](BuildStrategy &self, bool b) {
-            PADDLE_ENFORCE_NE(self.IsFinalized(),
-                              true,
-                              platform::errors::PreconditionNotMet(
-                                  "BuildStrategy has been finalized, cannot be "
-                                  "configured again."));
-            self.sync_batch_norm_ = b;
-          },
-          R"DOC((bool, optional): sync_batch_norm indicates whether to use
-                synchronous batch normalization which synchronizes the mean
-                and variance through multi-devices in training phase.
-                Current implementation doesn't support FP16 training and CPU.
-                And only synchronous on one machine, not all machines.
-                Default is False.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.sync_batch_norm = True
-          )DOC")
-      .def_property(
-          "memory_optimize",
-          [](const BuildStrategy &self) -> py::object {
-            if (self.memory_optimize_) {  // NOLINT
-              return py::cast(self.memory_optimize_.get());
-            } else {
-              return py::cast(nullptr);
-            }
-          },
-          [](BuildStrategy &self, const py::handle &value) {
-            auto *py_obj = value.ptr();
-            if (py_obj == nullptr || py_obj == Py_None) {
-              self.memory_optimize_ = paddle::none;
-            } else if (PyBool_Check(py_obj)) {
-              self.memory_optimize_ = (py_obj == Py_True);
-            } else {
-              PADDLE_THROW(platform::errors::InvalidArgument(
-                  "BuildStrategy.memory_optimize must be set to None, False "
-                  "or True"));
-            }
-          },
-          R"DOC((bool, optional): memory opitimize aims to save total memory
-                consumption, set to True to enable it.
-
-                Default None. None means framework would choose to use or not use
-                this strategy automatically. Currently, None means that it is
-                enabled when GC is disabled, and disabled when GC is enabled.
-                True means enabling and False means disabling. Default is None.
-
-                Examples:
-                    .. code-block:: python
-
-                        >>> import paddle
-                        >>> import paddle.static as static
-
-                        >>> paddle.enable_static()
-
-                        >>> build_strategy = static.BuildStrategy()
-                        >>> build_strategy.memory_optimize = True
-
-          )DOC")
-      .def_property(
-          "is_distribution",
-          [](const BuildStrategy &self) { return self.is_distribution_; },
-          [](BuildStrategy &self, bool b) {
-#ifdef WIN32
-            if (b) {
-              PADDLE_THROW(platform::errors::Unavailable(
-                  "Distribution mode is not supported on Windows platform."));
-            }
-#else
-            self.is_distribution_ = b;
-#endif
-          })
-      .def_property(
-          "async_mode",
-          [](const BuildStrategy &self) { return self.async_mode_; },
-          [](BuildStrategy &self, bool b) { self.async_mode_ = b; })
-      .def_property(
-          "enable_inplace",
-          [](const BuildStrategy &self) { return self.enable_inplace_; },
-          [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
-      .def_property(
-          "enable_addto",
-          [](const BuildStrategy &self) { return self.enable_addto_; },
-          [](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
-      .def_property(
-          "fuse_all_reduce_ops",
-          [](const BuildStrategy &self) {
-            return self.fuse_all_reduce_ops_ == true ||
-                   self.fuse_all_reduce_ops_ == paddle::none;
-          },
-          [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
-      .def_property(
-          "enable_backward_optimizer_op_deps",
-          [](const BuildStrategy &self) {
-            return self.enable_backward_optimizer_op_deps_;
-          },
-          [](BuildStrategy &self, bool b) {
-            self.enable_backward_optimizer_op_deps_ = b;
-          })
-      .def_property(
-          "cache_runtime_context",
-          [](const BuildStrategy &self) { return self.cache_runtime_context_; },
-          [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
-      .def_property(
-          "mkldnn_enabled_op_types",
-          [](const BuildStrategy &self) {
-            return self.mkldnn_enabled_op_types_;
-          },
-          [](BuildStrategy &self,
-             const std::unordered_set<std::string> &mkldnn_enabled_op_types) {
-            self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types;
-          })
-      .def_property(
-          "fix_op_run_order",
-          [](const BuildStrategy &self) { return self.fix_op_run_order_; },
-          [](BuildStrategy &self, bool fix_op_run_order) {
-            self.fix_op_run_order_ = fix_op_run_order;
-          })
-      .def_property(
-          "allow_cuda_graph_capture",
-          [](const BuildStrategy &self) {
-            return self.allow_cuda_graph_capture_;
-          },
-          [](BuildStrategy &self, bool allow_cuda_graph_capture) {
-            self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
-          })
-      .def("_copy",
-           [](const BuildStrategy &self) {
-             auto new_bs = self;
-             new_bs.ClearFinalized();
-             return new_bs;
-           })
-      .def("__str__",
-           [](const BuildStrategy &self) {
-             std::stringstream ss;
-             ss << self;
-             return ss.str();
-           })
-      .def(
-          "_finalize_strategy_and_create_passes",
-          [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
-            return self.CreatePassesFromStrategy(true);
-          },
-          R"DOC(Allow user to customized passes. Normally model-specific
-                optimization passes should be defined in this way. BuildStrategy
-                cannot be updated after being finalized.)DOC");
-
-  m.def("_set_cached_executor_build_strategy",
-        [](int64_t program_id, const BuildStrategy &build_strategy) {
-          auto &cached_exe_info = framework::ExecutorInfoCache::Instance();
-          cached_exe_info.SetBuildStrategy(program_id, build_strategy);
-        });
-
-  pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::vector<std::string> &,
-                  const std::string &,
-                  Scope *,
-                  std::vector<Scope *> &,
-                  const ExecutionStrategy &,
-                  const BuildStrategy &,
-                  ir::Graph *>())
-      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
-      // We still cannot get local_scope from this vector, since the element
-      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
-      // one by one and mark them as reference.
-      .def(
-          "local_scopes",
-          [](ParallelExecutor &self) -> std::vector<Scope *> * {
-            return &self.GetLocalScopes();
-          },
-          py::return_value_policy::reference)
-      .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes)
-      .def("_need_create_local_exe_scopes",
-           &ParallelExecutor::NeedCreateLocalExeScope)
-      .def("feed_tensors_into_local_scopes",
-           &ParallelExecutor::FeedTensorsIntoLocalScopes)
-      .def("feed_and_split_tensor_into_local_scopes",
-           &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
-      .def("run",
-           [](ParallelExecutor &self,
-              const std::vector<std::string> &fetch_tensors,
-              bool return_merged) -> py::object {
-             if (return_merged) {
-               paddle::framework::FetchList ret;
-               /*gil_scoped_release*/ {
-                 pybind11::gil_scoped_release release;
-                 ret = self.RunAndMerge(fetch_tensors);
-               }
-               return py::cast(std::move(ret));
-             } else {
-               paddle::framework::FetchUnmergedList ret;
-               /*gil_scoped_release*/ {
-                 pybind11::gil_scoped_release release;
-                 ret = self.Run(fetch_tensors);
-               }
-               return py::cast(std::move(ret));
-             }
-           })
-      .def("device_count", &ParallelExecutor::DeviceCount);
-  using VarQuantScale =
-      std::unordered_map<std::string, std::pair<bool, phi::DenseTensor>>;
-  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
-  pass.def(py::init())
-      .def("has", &ir::Pass::Has)
-      .def("set_not_owned",
-           [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
-             self.SetNotOwned<ProgramDesc>(attr_name, &attr);
-           })
-      .def(
-          "set",
-          [](ir::Pass &self, const std::string &name, const std::string &attr) {
-            self.Set<std::string>(name, new std::string(attr));
-          })
-      .def("set",
-           [](ir::Pass &self, const std::string &name, bool val) {
-             self.Set<bool>(name, new bool(val));
-           })
-      .def("set",
-           [](ir::Pass &self, const std::string &name, int val) {
-             self.Set<const int>(name, new int(val));
-           })
-      .def("set",
-           [](ir::Pass &self,
-              const std::string &name,
-              std::vector<std::string> set) {
-             self.Set(name, new std::vector<std::string>(set));
-           })
-      .def("set",
-           [](ir::Pass &self,
-              const std::string &name,
-              std::unordered_set<std::string> set) {
-             self.Set(name, new std::unordered_set<std::string>(set));
-           })
-      .def("set",
-           [](ir::Pass &self,
-              const std::string &name,
-              std::unordered_set<int> set) {
-             self.Set(name, new std::unordered_set<int>(set));
-           })
-      .def("set",
-           [](ir::Pass &self, const std::string &name, VarQuantScale scales) {
-             self.Set(name, new VarQuantScale(scales));
-           })
-      .def("type", &ir::Pass::Type)
-      .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
-        self.Apply(graph.get());
-      });
-
-  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
-      m, "PassBuilder");
-  pb.def(py::init())
-      .def("append_pass",
-           [](ir::PassBuilder &self,
-              const std::string &pass_type) -> std::shared_ptr<ir::Pass> {
-             return self.AppendPass(pass_type);
-           })
-      .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); })
-      .def("insert_pass",
-           [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) {
-             return self.InsertPass(idx, pass_type);
-           })
-      .def("remove_pass",
-           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
-}
-
-}  // namespace pybind
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc
index 29c5c764c9753..e8efeb54f16b0 100644
--- a/paddle/fluid/pybind/pir.cc
+++ b/paddle/fluid/pybind/pir.cc
@@ -79,10 +79,10 @@
 #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
 #include "paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.h"
 #include "paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h"
+#include "paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.h"
 #include "paddle/cinn/hlir/framework/pir_compiler.h"
 #endif
 
-namespace py = pybind11;
 using paddle::dialect::ApiBuilder;
 using paddle::dialect::DenseTensorArrayType;
 using paddle::dialect::DenseTensorType;
@@ -116,10 +116,12 @@ using pir::Program;
 using pir::StrAttribute;
 using pir::Type;
 using pir::Value;
+using pir::VectorType;
 using pybind11::return_value_policy;
 
 COMMON_DECLARE_bool(print_ir);
 COMMON_DECLARE_bool(pir_apply_shape_optimization_pass);
+COMMON_DECLARE_bool(logging_pir_py_code_dump_symbolic_dims);
 
 namespace paddle {
 namespace pybind {
@@ -412,6 +414,12 @@ void BindProgram(py::module *m) {
            [](Program &self, IrMapping &ir_mapper) {
              return Clone(self, &ir_mapper);
            })
+      .def(
+          "copy_to_block",
+          [](std::shared_ptr<Program> self,
+             pir::IrMapping &mapper,
+             Block *block) { return self->CopyToBlock(mapper, block); },
+          return_value_policy::reference)
       .def(
           "list_vars",
           [](std::shared_ptr<Program> self) {
@@ -449,6 +457,17 @@ void BindProgram(py::module *m) {
              global_prog_seed = random_seed;
              SetProgramInt64Attr(self, "random_seed", random_seed);
            })
+      .def_property_readonly(
+          "num_blocks",
+          [](const std::shared_ptr<Program> &self) {  // blocks over all regions
+            auto module_op = self->module_op();
+            size_t block_count = 0;
+            const size_t region_count = module_op->num_regions();
+            for (size_t idx = 0; idx < region_count; ++idx) {
+              block_count += module_op->region(idx).size();
+            }
+            return block_count;
+          })
       .def_property_readonly(
           "blocks",
           [](const std::shared_ptr<Program> &self) {
@@ -654,9 +673,12 @@ void BindIrMapping(py::module *m) {
   ir_mapping.def(py::init<>())
       .def("look_up",
            [](IrMapping &self, Value from) { return self.Lookup(from); })
-      .def("add", [](IrMapping &self, Value from, Value to) {
-        self.Add<Value>(from, to);
-      });
+      .def("add",
+           [](IrMapping &self, Value from, Value to) {
+             self.Add<Value>(from, to);
+           })
+      .def("size",
+           [](IrMapping &self) { return self.GetMutableMap<Value>().size(); });
 }
 
 void BindCloneOptions(py::module *m) {
@@ -1321,6 +1343,13 @@ void BindType(py::module *m) {
             PADDLE_THROW(phi::errors::InvalidArgument(
                 "can't set _local_shape when building static graph"));
           })
+      .def("as_vec_type",
+           [](Type self) -> py::object {  // VectorType view, or None
+             if (auto vec_type = self.dyn_cast<VectorType>()) {
+               return py::cast(vec_type);
+             }
+             return py::none();  // self is not a VectorType
+           })
       .def("__str__", [](Type &self) {
         std::ostringstream print_stream;
         print_stream << self;
@@ -1355,7 +1384,13 @@ void BindType(py::module *m) {
            }
          });
 }
-
+void BindVectorType(py::module *m) {  // Python bindings for pir::VectorType
+  py::class_<VectorType, Type> vec_type(*m, "VectorType");
+  vec_type.def("as_list", &VectorType::data);
+  m->def("create_vec_type", [](const std::vector<Type> &types) {
+    return VectorType::get(pir::IrContext::Instance(), types);
+  });
+}
 void BindAttribute(py::module *m) {
   py::class_<Attribute> ir_attr(*m, "Attribute", py::module_local());
   ir_attr.def("__eq__", &Attribute::operator==)
@@ -2404,12 +2439,23 @@ std::shared_ptr<Program> ApplyFusedBnAddActPass(
   return program;
 }
 
+void DumpPirPyCodeIfNeed(const std::shared_ptr<Program> &program,
+                         const std::string &file_name) {
+#ifdef PADDLE_WITH_CINN  // empty body (no-op) when built without CINN
+  ::cinn::dialect::ir::PirToPyCodeConverter(program.get())
+      .file_name(file_name)
+      .dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims)
+      .SaveIfFlagEnabled();
+#endif  // PADDLE_WITH_CINN
+}
+
 void BindIrPass(pybind11::module *m) {
   m->def("apply_cinn_pass", ApplyCinnPass);
   m->def("check_infer_symbolic_if_need", CheckInferSymbolicIfNeed);
   m->def("infer_symbolic_shape_pass", InferSymbolicShapePass);
   m->def("apply_cse_pass", ApplyCommonSubexpressionEliminationPass);
   m->def("apply_bn_add_act_pass", ApplyFusedBnAddActPass);
+  m->def("dump_pir_py_code_if_need", DumpPirPyCodeIfNeed);
 
   py::class_<Pass, std::shared_ptr<Pass>> pass(*m,
                                                "Pass",
@@ -2487,6 +2533,7 @@ void BindPir(pybind11::module *module) {
   BindOperation(&ir_module);
   BindOpOperand(&ir_module);
   BindType(&ir_module);
+  BindVectorType(&ir_module);
   BindAttribute(&ir_module);
   BindInsertionPoint(&ir_module);
   BindUtils(&ir_module);
diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc
index c97c9cdc94d7d..adf5852aabb64 100644
--- a/paddle/fluid/pybind/place.cc
+++ b/paddle/fluid/pybind/place.cc
@@ -181,8 +181,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList);
 PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList);
 PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 PyTypeObject *g_place_pytype = nullptr;
 PyTypeObject *g_customplace_pytype = nullptr;
 PyTypeObject *g_cudaplace_pytype = nullptr;
@@ -680,5 +679,4 @@ void BindPlace(pybind11::module &m) {  // NOLINT
       .def("__str__", string::to_string<const platform::Place &>);
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index e99bf851f7c64..89c1e5ee0688d 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -35,8 +35,7 @@ limitations under the License. */
 
 namespace py = pybind11;
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 
 PyTypeObject *g_vartype_pytype = nullptr;
 PyTypeObject *g_blockdesc_pytype = nullptr;
@@ -547,5 +546,4 @@ void BindJitProperty(pybind11::module *m) {
       .def("parse_from_string", DeserializeMessage<jit::Property>);
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b1163adc932fc..ae49f2594ce0a 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -62,7 +62,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/raw_tensor.h"
@@ -146,7 +145,6 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/pybind/compiled_program.h"
-#include "paddle/fluid/pybind/parallel_executor.h"
 #include "paddle/fluid/pybind/place.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
diff --git a/paddle/fluid/pybind/rpc.cc b/paddle/fluid/pybind/rpc.cc
index ee35e9c3a4164..bc947af36f9a1 100644
--- a/paddle/fluid/pybind/rpc.cc
+++ b/paddle/fluid/pybind/rpc.cc
@@ -19,7 +19,6 @@
 
 namespace py = pybind11;
 using paddle::distributed::FutureWrapper;
-using paddle::distributed::PythonRpcHandler;
 using paddle::distributed::RpcAgent;
 using paddle::distributed::WorkerInfo;
 namespace paddle {
diff --git a/paddle/fluid/pybind/xpu_streams_py.cc b/paddle/fluid/pybind/xpu_streams_py.cc
index 044b954ce6b65..dc60ed8468798 100644
--- a/paddle/fluid/pybind/xpu_streams_py.cc
+++ b/paddle/fluid/pybind/xpu_streams_py.cc
@@ -25,8 +25,7 @@
 
 namespace py = pybind11;
 
-namespace paddle {
-namespace pybind {
+namespace paddle::pybind {
 void BindXpuStream(py::module *m_ptr) {
   auto &m = *m_ptr;
 
@@ -38,7 +37,10 @@ void BindXpuStream(py::module *m_ptr) {
     }
     int curr_device_id = paddle::platform::GetXPUCurrentDeviceId();
     paddle::platform::SetXPUDeviceId(device_id);
-    PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait());
+    auto place = phi::XPUPlace(device_id);
+    auto *dev_ctx = static_cast<phi::XPUContext *>(
+        paddle::platform::DeviceContextPool::Instance().Get(place));
+    dev_ctx->Wait();
     paddle::platform::SetXPUDeviceId(curr_device_id);
 #else
     PADDLE_THROW(platform::errors::Unavailable(
@@ -47,5 +49,4 @@ void BindXpuStream(py::module *m_ptr) {
   });
 }
 
-}  // namespace pybind
-}  // namespace paddle
+}  // namespace paddle::pybind
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 0788d6994ce3d..84eb2c5d39693 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -112,6 +112,8 @@ set(PHI_SRCS
     ${infermeta_srcs}
     ${capi_srcs})
 
+set(PHI_KERNEL_GPU_SRCS ${kernels_gpu_srcs})
+
 if(WITH_SHARED_PHI)
   set(PHI_BUILD_TYPE
       SHARED
@@ -205,6 +207,40 @@ set(PHI_LIB
     "${CMAKE_CURRENT_BINARY_DIR}/${PHI_NAME}"
     CACHE FILEPATH "PHI Library" FORCE)
 
+# NOTE(silverling): build a separate library `phi_kernel_gpu` that contains all
+# GPU kernel implementations. This allows Paddle to be built with more CUDA
+# archs and reduces the binary size of the `phi` library.
+if(WITH_GPU OR WITH_ROCM)
+  if(WITH_GPU)
+    nv_library(
+      phi_kernel_gpu ${PHI_BUILD_TYPE}
+      SRCS ${PHI_KERNEL_GPU_SRCS}
+      DEPS ${PHI_DEPS})
+  elseif(WITH_ROCM)
+    hip_library(
+      phi_kernel_gpu ${PHI_BUILD_TYPE}
+      SRCS ${PHI_KERNEL_GPU_SRCS}
+      DEPS ${PHI_DEPS})
+  endif()
+
+  # NOTE(silverling): make `phi` depend on `phi_kernel_gpu` (even though `phi` does not
+  # use it) so that targets depending on `phi` automatically depend on `phi_kernel_gpu`
+  # as well. This makes users' lives easier.
+  target_link_libraries(phi phi_kernel_gpu)
+
+  # NOTE(silverling): `phi_kernel_gpu` needs symbols from `phi`.
+  # When PHI is built as shared libraries, this works with no extra step.
+  # When it is built statically, `phi` must be linked explicitly at link time.
+  if(NOT WITH_SHARED_PHI)
+    target_link_libraries(phi_kernel_gpu phi)
+  endif()
+
+  string(REPLACE "phi" "phi_kernel_gpu" PHI_KERNEL_GPU_NAME ${PHI_NAME})
+  set(PHI_KERNEL_GPU_LIB
+      "${CMAKE_CURRENT_BINARY_DIR}/${PHI_KERNEL_GPU_NAME}"
+      CACHE FILEPATH "PHI Kernel GPU Library" FORCE)
+endif()
+
 if(MKL_FOUND AND WITH_ONEMKL)
   target_include_directories(phi PRIVATE ${MKL_INCLUDE})
 endif()
diff --git a/paddle/phi/api/generator/dist_api_gen.py b/paddle/phi/api/generator/dist_api_gen.py
index 54605d19b256d..aed5d2c28d571 100644
--- a/paddle/phi/api/generator/dist_api_gen.py
+++ b/paddle/phi/api/generator/dist_api_gen.py
@@ -295,7 +295,7 @@
     }}
     std::vector<phi::MetaTensor*> {name}_meta_ptr_vec({name}.size());
     for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{
-      {name}_meta_ptr_vec[i] = &{name}_meta_vec[i];
+      {name}_meta_ptr_vec[i] = {name}[i] ? &{name}_meta_vec[i] : nullptr;
     }}
 """
 INFER_GLOBAL_SHAPE_TEMPLATE = """
@@ -400,7 +400,7 @@
       std::vector<phi::MetaTensor> {name}_meta_vec = MakeMetaTensor({name});
       std::vector<phi::MetaTensor*> {name}_meta_ptr_vec({name}_meta_vec.size());
       for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{
-        {name}_meta_ptr_vec[i] = &{name}_meta_vec[i];
+        {name}_meta_ptr_vec[i] = {name}[i] ? &{name}_meta_vec[i] : nullptr;
       }}
 """
 INFER_META_TEMPLATE = """
@@ -1106,9 +1106,7 @@ def generate_output_creation_code(self) -> str:
                         )
                     else:
                         if (
-                            self.need_to_generate_code_for_inplace_or_view_impl(
-                                i
-                            )
+                            self.need_to_generate_code_for_inplace_impl(i)
                             and self.generate_general_infer_spmd
                         ):
                             output_creation_code += (
diff --git a/paddle/phi/api/generator/dist_bw_api_gen.py b/paddle/phi/api/generator/dist_bw_api_gen.py
index 1d57d552d7767..34d495d9d0536 100644
--- a/paddle/phi/api/generator/dist_bw_api_gen.py
+++ b/paddle/phi/api/generator/dist_bw_api_gen.py
@@ -53,33 +53,41 @@
     std::shared_ptr<phi::distributed::DistTensor> shared_dist_out =
         CreateKernelDistOutput({}, !rank_is_in_current_mesh, spmd_info.second[0]);
     phi::distributed::DistTensor* dist_out = shared_dist_out.get();
-    phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value();
-    if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{
-      *dense_out = phi::DenseTensor(
+    phi::DenseTensor* dense_out = nullptr;
+    if (dist_out) {{
+      dense_out = dist_out->unsafe_mutable_value();
+      if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{
+        *dense_out = phi::DenseTensor(
             std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
             phi::DenseTensorMeta());
     }}
+    }}
 """
 SINGLE_OUT_CREATION_TEMPLATE = """
     std::shared_ptr<phi::distributed::DistTensor> shared_dist_out =
         CreateKernelDistOutput({}, !rank_is_in_current_mesh);
     phi::distributed::DistTensor* dist_out = shared_dist_out.get();
-    phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value();
-    if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{
+    phi::DenseTensor* dense_out = nullptr;
+    if (dist_out) {{
+      dense_out = dist_out->unsafe_mutable_value();
+      if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{
       *dense_out = phi::DenseTensor(
-            std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
-            phi::DenseTensorMeta());
+                std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
+                phi::DenseTensorMeta());
+    }}
     }}
 """
 VECTOR_OUT_CREATION_TEMPLATE_WITH_NO_SPMD = """
     auto dist_out = SetKernelDistOutput({name});
-    std::vector<phi::DenseTensor*> dense_out(dist_out.size());
+    std::vector<phi::DenseTensor*> dense_out(dist_out.size(), nullptr);
     for (size_t i=0; i<dist_out.size(); i++) {{
-      dense_out[i] = dist_out[i]->unsafe_mutable_value();
-      if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{
-        *dense_out[i] = phi::DenseTensor(
-              std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
-              phi::DenseTensorMeta());
+      if (dist_out[i]) {{
+        dense_out[i] = dist_out[i]->unsafe_mutable_value();
+        if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{
+            *dense_out[i] = phi::DenseTensor(
+                std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
+                phi::DenseTensorMeta());
+        }}
       }}
     }}
 """
@@ -90,13 +98,15 @@
     for(auto& e: shared_dist_out){{
       dist_out.push_back(e.get());
     }}
-    std::vector<phi::DenseTensor*> dense_out(dist_out.size());
+    std::vector<phi::DenseTensor*> dense_out(dist_out.size(), nullptr);
     for (size_t i=0; i<dist_out.size(); i++) {{
-      dense_out[i] = dist_out[i]->unsafe_mutable_value();
-      if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{
-        *dense_out[i] = phi::DenseTensor(
-              std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
-              phi::DenseTensorMeta());
+      if (dist_out[i]) {{
+        dense_out[i] = dist_out[i]->unsafe_mutable_value();
+        if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{
+            *dense_out[i] = phi::DenseTensor(
+                std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
+                phi::DenseTensorMeta());
+        }}
       }}
     }}
 """
@@ -108,13 +118,15 @@
     for(auto& e: shared_dist_out){{
       dist_out.push_back(e.get());
     }}
-    std::vector<phi::DenseTensor*> dense_out(dist_out.size());
+    std::vector<phi::DenseTensor*> dense_out(dist_out.size(), nullptr);
     for (size_t i=0; i<dist_out.size(); i++) {{
-      dense_out[i] = dist_out[i]->unsafe_mutable_value();
-      if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{
-        *dense_out[i] = phi::DenseTensor(
-              std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
-              phi::DenseTensorMeta());
+      if (dist_out[i]) {{
+        dense_out[i] = dist_out[i]->unsafe_mutable_value();
+        if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{
+            *dense_out[i] = phi::DenseTensor(
+                std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
+                phi::DenseTensorMeta());
+        }}
       }}
     }}
 """
@@ -156,13 +168,15 @@
 """
 MULTI_VECTOR_OUT_CREATION_TEMPLATE = """
     auto dist_out_{i} = SetKernelDistOutput({name});
-    std::vector<phi::DenseTensor*> dense_out_{i}(dist_out_{i}.size());
+    std::vector<phi::DenseTensor*> dense_out_{i}(dist_out_{i}.size(), nullptr);
     for (size_t i = 0; i < dist_out_{i}.size(); i++) {{
-      dense_out_{i}[i] = const_cast<phi::DenseTensor*>(&dist_out_{i}[i]->value());
-      if (dense_out_{i}[i] && !rank_is_in_current_mesh && !dist_out_{i}[i]->defined()) {{
-        *dense_out_{i}[i]= phi::DenseTensor(
-            std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
-            phi::DenseTensorMeta());
+      if (dist_out_{i}[i]) {{
+        dense_out_{i}[i] = const_cast<phi::DenseTensor*>(&dist_out_{i}[i]->value());
+        if (dense_out_{i}[i] && !rank_is_in_current_mesh && !dist_out_{i}[i]->defined()) {{
+            *dense_out_{i}[i]= phi::DenseTensor(
+                std::make_shared<phi::Allocation>(nullptr, 0, phi::distributed::GetDefaultPlace()),
+                phi::DenseTensorMeta());
+        }}
       }}
     }}
 """
diff --git a/paddle/phi/api/include/tensor_utils.h b/paddle/phi/api/include/tensor_utils.h
index ada842835ffd8..3c2307fd01f0c 100644
--- a/paddle/phi/api/include/tensor_utils.h
+++ b/paddle/phi/api/include/tensor_utils.h
@@ -62,7 +62,7 @@ PADDLE_API Tensor from_blob(void* data,
  * @note Input of `Reshard` should be a `paddle::Tensor` whose impl is
  * shared_ptr of DistTensor. According to the given DistAttr, input will be
  * reshard to wanted distributed state. And it will return shared_ptr of a new
- * DistTensor as outptut.
+ * DistTensor as output.
  *
  * @param input The input tensor to be resharded.
  * @param dist_attr The dist_attr to be resharded.
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
index ef5cfc90727ff..c6426898371d2 100644
--- a/paddle/phi/api/lib/api_gen_utils.cc
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -736,6 +736,7 @@ std::shared_ptr<phi::distributed::DistTensor> CreateKernelDistOutput(
     }
     return dist_output;
   }
+  VLOG(4) << "CreateKernelDistOutput with NULL out";
   return nullptr;
 }
 
diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc
index ee1e21a58e2f1..e2eb1af09d8a5 100644
--- a/paddle/phi/api/lib/context_pool.cc
+++ b/paddle/phi/api/lib/context_pool.cc
@@ -23,8 +23,7 @@ limitations under the License. */
 #include "paddle/phi/core/cuda_stream.h"
 #endif
 
-namespace paddle {
-namespace experimental {
+namespace paddle::experimental {
 
 void DeviceContextPool::SyncDeviceContext(const Place& place) {
   if (!phi::DeviceContextPool::IsInitialized()) {
@@ -64,8 +63,7 @@ phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) {
   return const_cast<phi::DeviceContext*>(Get(place));  // NOLINT
 }
 
-}  // namespace experimental
-}  // namespace paddle
+}  // namespace paddle::experimental
 
 namespace paddle {
 
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index 28aecaf64094c..b9962901851dc 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -303,6 +303,7 @@ phi::DenseTensor CheckAndTrans2NewContiguousTensor(
 std::vector<phi::DenseTensor> CheckAndTrans2NewContiguousTensor(
     const std::vector<phi::DenseTensor>& tensor) {
   std::vector<phi::DenseTensor> out;
+  out.reserve(tensor.size());
   for (auto& t : tensor) {
     out.emplace_back(CheckAndTrans2NewContiguousTensor(t));
   }
diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h
index 65729a01c20d4..9e023428a7672 100644
--- a/paddle/phi/api/lib/data_transform.h
+++ b/paddle/phi/api/lib/data_transform.h
@@ -64,7 +64,7 @@ class TransformFlag {
   // trans_data_type_ can be setted by api[data_transform->support_trans_dtype]
   // in the yaml file.
   // trans_data_type_ only affect the non complex types,
-  // the complex is always transferd, except stop_transform_ is true.
+  // the complex is always transferred, unless stop_transform_ is true.
   bool trans_data_type_ = false;
 
   // trans_backend_ and trans_layout_ are true defaultly,
diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc
index 94b04404c4ced..16dea76b4ad68 100644
--- a/paddle/phi/api/lib/kernel_dispatch.cc
+++ b/paddle/phi/api/lib/kernel_dispatch.cc
@@ -26,9 +26,7 @@ limitations under the License. */
 #include "paddle/phi/backends/device_manager.h"
 #endif
 
-namespace paddle {
-namespace experimental {
-namespace detail {
+namespace paddle::experimental::detail {
 
 // We need judge whether the allocation is nullptr,
 // whether the allocation is initialized, wo we need GetHolder method
@@ -109,7 +107,8 @@ std::size_t CountLeadingZeros(uint32_t val) {
 #endif
 }
 
-}  // namespace detail
+}  // namespace paddle::experimental::detail
+namespace paddle::experimental {
 
 phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) {
   auto& pool = paddle::experimental::DeviceContextPool::Instance();
@@ -182,5 +181,4 @@ phi::DataLayout ParseLayoutWithInputOrder(phi::DataLayout layout,
   return layout != phi::DataLayout::UNDEFINED ? layout : ParseLayout(tensor);
 }
 
-}  // namespace experimental
-}  // namespace paddle
+}  // namespace paddle::experimental
diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc
index fd13e0809fadd..38cd92057931a 100644
--- a/paddle/phi/api/lib/scalar.cc
+++ b/paddle/phi/api/lib/scalar.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
-namespace experimental {
+namespace paddle::experimental {
 
 template <>
 ScalarBase<Tensor>::ScalarBase(const Tensor& tensor_in)
@@ -53,5 +52,4 @@ ScalarBase<Tensor>::ScalarBase(const Tensor& tensor_in)
   }
 }
 
-}  // namespace experimental
-}  // namespace paddle
+}  // namespace paddle::experimental
diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc
index c6cce9577e9ec..b411eb1e69377 100644
--- a/paddle/phi/api/lib/tensor_utils.cc
+++ b/paddle/phi/api/lib/tensor_utils.cc
@@ -124,7 +124,7 @@ PADDLE_API std::shared_ptr<phi::distributed::DistTensor> reshard(
                         typeid(input.impl().get()).name()));
   auto dev_ctx = phi::distributed::GetDistTensorDeviceContext(
       static_cast<phi::distributed::DistTensor*>(input.impl().get()));
-  auto input_tensor_impl = input.impl();
+  const auto& input_tensor_impl = input.impl();
   std::shared_ptr<phi::distributed::DistTensor> dist_out_ptr = nullptr;
   if (input_tensor_impl) {
     phi::distributed::DistTensor* dist_tensor =
diff --git a/paddle/phi/api/profiler/trace_event.h b/paddle/phi/api/profiler/trace_event.h
index e526953d5c8e0..b74f1754ee318 100644
--- a/paddle/phi/api/profiler/trace_event.h
+++ b/paddle/phi/api/profiler/trace_event.h
@@ -49,7 +49,7 @@ enum class TracerEventType {
   Communication = 12,
   // Used to mark python api
   PythonOp = 13,
-  // Used to mark python level userdefined
+  // Used to mark python level user-defined
   PythonUserDefined = 14,
   // A flag to denote the number of current types
   NumTypes
diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc
index 624aabeffaba7..d669eab67af42 100644
--- a/paddle/phi/backends/custom/custom_device.cc
+++ b/paddle/phi/backends/custom/custom_device.cc
@@ -587,7 +587,7 @@ class CustomDevice : public DeviceInterface {
 #undef return_result
   }
 
-  C_DataType ToCDatatType(phi::DataType data_type) {
+  C_DataType ToCDataType(phi::DataType data_type) {
 #define return_result(in, ret) \
   case in:                     \
     return C_DataType::ret
@@ -669,7 +669,7 @@ class CustomDevice : public DeviceInterface {
         send_buf,
         recv_buf,
         count,
-        ToCDatatType(data_type),
+        ToCDataType(data_type),
         ToXCCLReduceOp(op),
         reinterpret_cast<C_CCLComm>(comm),
         reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -685,7 +685,7 @@ class CustomDevice : public DeviceInterface {
     PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_broadcast(
         buf,
         count,
-        ToCDatatType(data_type),
+        ToCDataType(data_type),
         root,
         reinterpret_cast<C_CCLComm>(comm),
         reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -704,7 +704,7 @@ class CustomDevice : public DeviceInterface {
         pimpl_->xccl_reduce(in_data,
                             out_data,
                             num,
-                            ToCDatatType(data_type),
+                            ToCDataType(data_type),
                             ToXCCLReduceOp(reduce_op),
                             root_id,
                             reinterpret_cast<C_CCLComm>(comm),
@@ -722,7 +722,7 @@ class CustomDevice : public DeviceInterface {
         send_buf,
         recv_buf,
         count,
-        ToCDatatType(data_type),
+        ToCDataType(data_type),
         reinterpret_cast<C_CCLComm>(comm),
         reinterpret_cast<C_Stream>(stream.raw_stream())));
   }
@@ -739,7 +739,7 @@ class CustomDevice : public DeviceInterface {
         send_buf,
         recv_buf,
         count,
-        ToCDatatType(data_type),
+        ToCDataType(data_type),
         ToXCCLReduceOp(reduce_op),
         reinterpret_cast<C_CCLComm>(comm),
         reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -767,7 +767,7 @@ class CustomDevice : public DeviceInterface {
     PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
         pimpl_->xccl_send(send_buf,
                           count,
-                          ToCDatatType(data_type),
+                          ToCDataType(data_type),
                           dest_rank,
                           reinterpret_cast<C_CCLComm>(comm),
                           reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -783,7 +783,7 @@ class CustomDevice : public DeviceInterface {
     PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
         pimpl_->xccl_recv(recv_buf,
                           count,
-                          ToCDatatType(data_type),
+                          ToCDataType(data_type),
                           src_rank,
                           reinterpret_cast<C_CCLComm>(comm),
                           reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -802,8 +802,8 @@ class CustomDevice : public DeviceInterface {
     if (pimpl_->xccl_all_to_all) {
       std::vector<C_DataType> c_send_dtype, c_recv_dtype;
       for (size_t i = 0; i < nranks; ++i) {
-        c_send_dtype.push_back(ToCDatatType(send_dtype[i]));
-        c_recv_dtype.push_back(ToCDatatType(recv_dtype[i]));
+        c_send_dtype.push_back(ToCDataType(send_dtype[i]));
+        c_recv_dtype.push_back(ToCDataType(recv_dtype[i]));
       }
       PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_all_to_all(
           send_buf,
@@ -823,7 +823,7 @@ class CustomDevice : public DeviceInterface {
         PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
             pimpl_->xccl_recv(recv_buf[i],
                               recv_count[i],
-                              ToCDatatType(recv_dtype[i]),
+                              ToCDataType(recv_dtype[i]),
                               i,
                               reinterpret_cast<C_CCLComm>(comm),
                               reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -833,7 +833,7 @@ class CustomDevice : public DeviceInterface {
           PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_send(
               const_cast<void*>(send_buf[i]),
               send_count[i],
-              ToCDatatType(send_dtype[i]),
+              ToCDataType(send_dtype[i]),
               i,
               reinterpret_cast<C_CCLComm>(comm),
               reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -848,7 +848,7 @@ class CustomDevice : public DeviceInterface {
         PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
             pimpl_->xccl_recv(recv_buf[i],
                               recv_count[i],
-                              ToCDatatType(recv_dtype[i]),
+                              ToCDataType(recv_dtype[i]),
                               i,
                               reinterpret_cast<C_CCLComm>(comm),
                               reinterpret_cast<C_Stream>(stream.raw_stream())));
@@ -872,7 +872,7 @@ class CustomDevice : public DeviceInterface {
     PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
         pimpl_->blas_axpby(device,
                            reinterpret_cast<C_Stream>(stream.raw_stream()),
-                           ToCDatatType(dtype),
+                           ToCDataType(dtype),
                            numel,
                            alpha,
                            x,
diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_alignment.h
similarity index 100%
rename from paddle/phi/backends/device_memory_aligment.h
rename to paddle/phi/backends/device_memory_alignment.h
diff --git a/paddle/phi/backends/dynload/cublas.cc b/paddle/phi/backends/dynload/cublas.cc
index 2fe9ae774bf7a..b870a90cb091c 100644
--- a/paddle/phi/backends/dynload/cublas.cc
+++ b/paddle/phi/backends/dynload/cublas.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cublas.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 std::once_flag cublas_dso_flag;
 void *cublas_dso_handle = nullptr;
 
@@ -34,5 +33,4 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
 #ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
 CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP);
 #endif
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h
index 8053bbb6bd2ce..6da85283d6e71 100644
--- a/paddle/phi/backends/dynload/cublas.h
+++ b/paddle/phi/backends/dynload/cublas.h
@@ -94,8 +94,14 @@ extern void *cublas_dso_handle;
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
   __macro(cublasDgetriBatched);           \
+  __macro(cublasCgetrfBatched);           \
+  __macro(cublasCgetriBatched);           \
+  __macro(cublasZgetrfBatched);           \
+  __macro(cublasZgetriBatched);           \
   __macro(cublasSmatinvBatched);          \
   __macro(cublasDmatinvBatched);          \
+  __macro(cublasCmatinvBatched);          \
+  __macro(cublasZmatinvBatched);          \
   __macro(cublasSgetrsBatched);           \
   __macro(cublasDgetrsBatched);
 
diff --git a/paddle/phi/backends/dynload/cuda_driver.cc b/paddle/phi/backends/dynload/cuda_driver.cc
index d9fd89a0c65a6..afd6fbb76f460 100644
--- a/paddle/phi/backends/dynload/cuda_driver.cc
+++ b/paddle/phi/backends/dynload/cuda_driver.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cuda_driver.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag cuda_dso_flag;
 void* cuda_dso_handle = nullptr;
@@ -33,5 +32,4 @@ bool HasCUDADriver() {
   return cuda_dso_handle != nullptr;
 }
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/cupti.cc b/paddle/phi/backends/dynload/cupti.cc
index 1d6e7c86c24d0..43fb64fd6f0a3 100644
--- a/paddle/phi/backends/dynload/cupti.cc
+++ b/paddle/phi/backends/dynload/cupti.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cupti.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag cupti_dso_flag;
 void *cupti_dso_handle = nullptr;
@@ -26,7 +25,6 @@ void *cupti_dso_handle = nullptr;
 
 CUPTI_ROUTINE_EACH(DEFINE_WRAP);
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
 
 #endif  // PADDLE_WITH_CUPTI
diff --git a/paddle/phi/backends/dynload/cusolver.cc b/paddle/phi/backends/dynload/cusolver.cc
index a5c88cf525c91..578edf14b49ed 100644
--- a/paddle/phi/backends/dynload/cusolver.cc
+++ b/paddle/phi/backends/dynload/cusolver.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cusolver.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag cusolver_dso_flag;
 void *cusolver_dso_handle;
@@ -32,5 +31,4 @@ CUSOLVER_ROUTINE_EACH_R1(DEFINE_WRAP);
 CUSOLVER_ROUTINE_EACH_R2(DEFINE_WRAP);
 #endif
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc
index ce8f87dc3cdfa..9d89b746df5b7 100644
--- a/paddle/phi/backends/dynload/cusparse.cc
+++ b/paddle/phi/backends/dynload/cusparse.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/cusparse.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag cusparse_dso_flag;
 void *cusparse_dso_handle;
@@ -34,5 +33,4 @@ CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
 CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
 #endif
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/cutlass_conv2d.cc b/paddle/phi/backends/dynload/cutlass_conv2d.cc
index 936a04fa3023c..a72eaba46eb0d 100644
--- a/paddle/phi/backends/dynload/cutlass_conv2d.cc
+++ b/paddle/phi/backends/dynload/cutlass_conv2d.cc
@@ -16,8 +16,7 @@
 #include <string>
 #include "paddle/phi/core/enforce.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag cutlass_dso_flag;
 void* cutlass_dso_handle;
@@ -53,5 +52,4 @@ void* GetCutlassConv2dHandle() {
   return cutlass_dso_handle;
 }
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index 612a959fc307b..5d8e26732196d 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -351,14 +351,14 @@ void* GetCublasDsoHandle() {
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll");
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll");
 #else
     return GetDsoHandleFromSearchPath(
@@ -372,13 +372,13 @@ void* GetCublasDsoHandle() {
   }
 #elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
@@ -400,13 +400,13 @@ void* GetCublasLtDsoHandle() {
 // APIs available after CUDA 10.1
 #if defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
@@ -448,7 +448,7 @@ void* GetCUDNNDsoHandle() {
       "You should do this according to your CUDA installation directory and "
       "CUDNN version.");
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg);
 #else
@@ -456,7 +456,7 @@ void* GetCUDNNDsoHandle() {
         FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg);
 #endif
   } else if (CUDA_VERSION >= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg);
 #else
@@ -467,7 +467,7 @@ void* GetCUDNNDsoHandle() {
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
 #else
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
   if (CUDA_VERSION >= 12030) {
     return GetDsoHandleFromSearchPath(
         FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path});
@@ -488,7 +488,7 @@ void* GetCUPTIDsoHandle() {
       FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path});
 #elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path});
 #else
@@ -497,7 +497,7 @@ void* GetCUPTIDsoHandle() {
 #endif
 
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(
         FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path});
 #else
@@ -520,7 +520,7 @@ void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
   return GetDsoHandleFromSearchPath(
       FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path});
 #else
@@ -530,7 +530,7 @@ void* GetCurandDsoHandle() {
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
 #else
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
   return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so");
@@ -564,7 +564,7 @@ void* GetCusolverDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
   return GetDsoHandleFromSearchPath(
       FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path});
 #else
@@ -572,7 +572,7 @@ void* GetCusolverDsoHandle() {
       FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path});
 #endif
 #else
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
@@ -585,14 +585,14 @@ void* GetCusparseDsoHandle() {
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll");
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll");
 #else
     return GetDsoHandleFromSearchPath(
@@ -606,13 +606,13 @@ void* GetCusparseDsoHandle() {
   }
 #elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
@@ -716,7 +716,7 @@ void* GetNCCLDsoHandle() {
   return GetDsoHandleFromSearchPath(
       FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg);
 #else
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
   return GetDsoHandleFromSearchPath(
       FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg);
 #else
@@ -782,7 +782,7 @@ void* GetCUFFTDsoHandle() {
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib");
 #elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so");
@@ -797,14 +797,14 @@ void* GetCUFFTDsoHandle() {
   }
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
   if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll");
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path});
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) {
-#ifdef WITH_PIP_CUDA_LIBRARIES
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll");
 #else
     return GetDsoHandleFromSearchPath(
diff --git a/paddle/phi/backends/dynload/lapack.cc b/paddle/phi/backends/dynload/lapack.cc
index 9719da9775146..924ea5192cd5c 100644
--- a/paddle/phi/backends/dynload/lapack.cc
+++ b/paddle/phi/backends/dynload/lapack.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include <mutex>
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag lapack_dso_flag;
 void* lapack_dso_handle = nullptr;
@@ -26,5 +25,4 @@ void* lapack_dso_handle = nullptr;
 
 LAPACK_ROUTINE_EACH(DEFINE_WRAP);
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/nvjpeg.cc b/paddle/phi/backends/dynload/nvjpeg.cc
index 9e9ac77dbaa98..8f13389398504 100644
--- a/paddle/phi/backends/dynload/nvjpeg.cc
+++ b/paddle/phi/backends/dynload/nvjpeg.cc
@@ -11,8 +11,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/dynload/nvjpeg.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag nvjpeg_dso_flag;
 void *nvjpeg_dso_handle;
@@ -21,5 +20,4 @@ void *nvjpeg_dso_handle;
 
 NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/dynload/nvtx.cc b/paddle/phi/backends/dynload/nvtx.cc
index b6bed459f32de..1fb52566fd6ae 100644
--- a/paddle/phi/backends/dynload/nvtx.cc
+++ b/paddle/phi/backends/dynload/nvtx.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 #ifndef _WIN32
 #include "paddle/phi/backends/dynload/nvtx.h"
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag nvtx_dso_flag;
 void *nvtx_dso_handle;
@@ -24,6 +23,5 @@ void *nvtx_dso_handle;
 
 NVTX_ROUTINE_EACH(DEFINE_WRAP);
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
 #endif
diff --git a/paddle/phi/backends/dynload/tensorrt.cc b/paddle/phi/backends/dynload/tensorrt.cc
index ff4217ce02054..9d21b70e3be01 100644
--- a/paddle/phi/backends/dynload/tensorrt.cc
+++ b/paddle/phi/backends/dynload/tensorrt.cc
@@ -16,8 +16,7 @@
 
 #include <string>
 
-namespace phi {
-namespace dynload {
+namespace phi::dynload {
 
 std::once_flag tensorrt_dso_flag;
 void* tensorrt_dso_handle;
@@ -80,5 +79,4 @@ void* GetTensorRtPluginHandle() {
   return GetDsoHandle(dso_name);
 }
 
-}  // namespace dynload
-}  // namespace phi
+}  // namespace phi::dynload
diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.cc b/paddle/phi/backends/gpu/cuda/cuda_graph.cc
index 43ec0a0c89c08..ced9c22816c63 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_graph.cc
+++ b/paddle/phi/backends/gpu/cuda/cuda_graph.cc
@@ -25,9 +25,7 @@ cudaError_t cudaGetFuncBySymbol(cudaFunction_t *functionPtr,
 COMMON_DECLARE_bool(use_cuda_malloc_async_allocator);
 COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch);
 
-namespace phi {
-namespace backends {
-namespace gpu {
+namespace phi::backends::gpu {
 
 std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
 paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
@@ -379,6 +377,4 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) {
 }
 #endif
 
-}  // namespace gpu
-}  // namespace backends
-}  // namespace phi
+}  // namespace phi::backends::gpu
diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
index 555cc2357b2ab..4f12ca02e3060 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
@@ -43,7 +43,7 @@ namespace gpu {
  *    this time, the cycle condition `i < (n)` is still satisfied, so it
  *    will cause illegal access to cuda memory.
  *
- *    Here is a real example in ERINE, it will trigger above error.
+ *    Here is a real example in ERNIE, it will trigger above error.
  *    The related data are:
  *      - blockIdx.x = 2172938
  *      - blockDim.x = 512
diff --git a/paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.cc b/paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.cc
index d5b484c8eeb56..b4f71dacb3fcb 100644
--- a/paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.cc
+++ b/paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.cc
@@ -17,9 +17,7 @@
 #include <cstdlib>
 #include <string>
 
-namespace phi {
-namespace backends {
-namespace gpu {
+namespace phi::backends::gpu {
 
 static int GetDefaultConvWorkspaceSizeLimitMBImpl() {
   const char *env_str = std::getenv("FLAGS_conv_workspace_size_limit");
@@ -31,6 +29,4 @@ int GetDefaultConvWorkspaceSizeLimitMB() {
   static auto workspace_size = GetDefaultConvWorkspaceSizeLimitMBImpl();
   return workspace_size;
 }
-}  // namespace gpu
-}  // namespace backends
-}  // namespace phi
+}  // namespace phi::backends::gpu
diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc
index 32546f762c39e..7f83b5c0e1da3 100644
--- a/paddle/phi/backends/gpu/gpu_info.cc
+++ b/paddle/phi/backends/gpu/gpu_info.cc
@@ -24,9 +24,7 @@ limitations under the License. */
 
 COMMON_DECLARE_string(selected_gpus);
 
-namespace phi {
-namespace backends {
-namespace gpu {
+namespace phi::backends::gpu {
 
 static inline std::vector<std::string> Split(std::string const& original,
                                              char separator) {
@@ -83,6 +81,4 @@ size_t GpuMinChunkSize() {
   return 1 << 8;
 }
 
-}  // namespace gpu
-}  // namespace backends
-}  // namespace phi
+}  // namespace phi::backends::gpu
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index fc0f8ee1e35e1..60a4303c4605a 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -1009,6 +1009,10 @@ XPUOpMap& get_kl2_ops() {
       {"swish", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"swish_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"swiglu",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"swiglu_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"take_along_axis",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"tanh_grad",
@@ -1055,7 +1059,8 @@ XPUOpMap& get_kl2_ops() {
                      phi::DataType::INT64,
                      phi::DataType::BOOL,
                      phi::DataType::FLOAT64,
-                     phi::DataType::FLOAT32})},
+                     phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16})},
       {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"transpose2_grad",
        XPUKernelSet({phi::DataType::FLOAT32,
@@ -1248,6 +1253,7 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})},
       {"sequence_unpad_xpu",
        XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})},
+      {"block_multihead_attention_xpu", XPUKernelSet({phi::DataType::FLOAT16})},
   };
 
   return s_xpu2_kernels;
diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc
index 1e06c9358a40a..c16822989c849 100644
--- a/paddle/phi/backends/xpu/xpu_context.cc
+++ b/paddle/phi/backends/xpu/xpu_context.cc
@@ -307,7 +307,7 @@ static int64_t get_l3_size(int i) {
 
 XPUContext::XPUContext() : DeviceContext() {
   if (std::getenv("XPU_CDNN_CLUSTER_PARALLEL") != nullptr) {
-    int default_num_stream = 4;
+    int default_num_stream = 2;
     if (std::getenv("XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER") != nullptr) {
       default_num_stream =
           atoi(std::getenv("XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER"));
@@ -348,12 +348,31 @@ XPUContext::~XPUContext() = default;
 
 const Place& XPUContext::GetPlace() const { return impls_[0]->GetPlace(); }
 
-XPUStream XPUContext::stream(int i) const { return impls_[i]->stream(); }
+XPUStream XPUContext::stream(int i) const {
+  CheckValidStreamId(i);
+  return impls_[i]->stream();
+}
 
 void XPUContext::SetStream(void* stream, int i) {
+  CheckValidStreamId(i);
   impls_[i]->SetStream(stream);
 }
 
+void XPUContext::CheckValidStreamId(int i) const {
+  PADDLE_ENFORCE_GE(
+      i,
+      0,
+      errors::InvalidArgument(
+          "The stream index must be greater than or equal to 0."));
+  PADDLE_ENFORCE_LT(
+      i,
+      GetStreamNum(),
+      errors::InvalidArgument("The stream index should be less than the number "
+                              "of streams used (%d), but got %d.",
+                              GetStreamNum(),
+                              i));
+}
+
 void XPUContext::SetXpuVersion(int version) {
   impls_[0]->xpu_version_ = static_cast<backends::xpu::XPUVersion>(version);
 }
@@ -371,6 +390,7 @@ backends::xpu::XPUVersion XPUContext::xpu_version() const {
 }
 
 xpu::Context* XPUContext::x_context(int i) const {
+  CheckValidStreamId(i);
   return impls_[i]->GetXContext();
 }
 
@@ -385,10 +405,12 @@ void XPUContext::Wait() const {
 }
 
 void XPUContext::SetXContext(xpu::Context* context, int i) {
+  CheckValidStreamId(i);
   impls_[i]->SetXContext(context);
 }
 
 void XPUContext::SetL3Cache(int64_t l3_size, int i) {
+  CheckValidStreamId(i);
   impls_[i]->SetL3Cache(l3_size);
 }
 
@@ -396,7 +418,45 @@ void XPUContext::SetBkclContext(xpu::BKCLContext_t context) {
   impls_[0]->SetBkclContext(context);
 }
 
-void XPUContext::CreateStream(int i) { impls_[i]->CreateStream(); }
+void XPUContext::CreateStream(int i) {
+  CheckValidStreamId(i);
+  impls_[i]->CreateStream();
+}
+
+void XPUContext::RecordEvent(XPUEvent event, int s) const {
+  CheckValidStreamId(s);
+  int r = xpu_event_record(event, stream(s));
+  PADDLE_ENFORCE_XRE_SUCCESS(r);
+}
+
+void XPUContext::StreamWaitEvent(XPUEvent event, int s) const {
+  CheckValidStreamId(s);
+  int r = xpu_stream_wait_event(stream(s), event);
+  PADDLE_ENFORCE_XRE_SUCCESS(r);
+}
+
+// Records a temporary event on stream `record_stream` and makes stream
+// `wait_stream` wait on it. The event is destroyed before returning.
+void XPUContext::StreamWaitStream(int wait_stream, int record_stream) const {
+  CheckValidStreamId(wait_stream);
+  CheckValidStreamId(record_stream);
+  XPUEvent event;
+  int r = xpu_event_create(&event);
+  PADDLE_ENFORCE_XRE_SUCCESS(r);
+  try {
+    RecordEvent(event, record_stream);
+    StreamWaitEvent(event, wait_stream);
+  } catch (...) {
+    // Do not leak the temporary event when recording or waiting fails; the
+    // destroy status is ignored so the original error is the one reported.
+    xpu_event_destroy(event);
+    throw;
+  }
+  r = xpu_event_destroy(event);
+  PADDLE_ENFORCE_XRE_SUCCESS(r);
+}
+
+int64_t XPUContext::GetStreamNum() const { return impls_.size(); }
 
 void XPUContext::Init() { impls_[0]->Init(); }
 }  // namespace phi
diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h
index 1f2aaa1990540..da6e5f10dc362 100644
--- a/paddle/phi/backends/xpu/xpu_context.h
+++ b/paddle/phi/backends/xpu/xpu_context.h
@@ -53,6 +53,10 @@ class XPUContext : public DeviceContext,
   xpu::BKCLContext_t bkcl_context() const;
   void SetBkclContext(xpu::BKCLContext_t context);
   void CreateStream(int i = 0);
+  void RecordEvent(XPUEvent event, int s) const;
+  void StreamWaitEvent(XPUEvent event, int s) const;
+  void StreamWaitStream(int wait_stream, int record_stream) const;
+  int64_t GetStreamNum() const;
 
   // For share external stream.
   void SetStream(void* stream, int i = 0);
@@ -89,6 +93,8 @@ class XPUContext : public DeviceContext,
  private:
   struct Impl;
   std::vector<std::unique_ptr<Impl>> impls_;
+
+  void CheckValidStreamId(int i) const;
 };
 
 // KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc
index e942e2f18cefa..7bbbe619ab39d 100644
--- a/paddle/phi/common/scalar.cc
+++ b/paddle/phi/common/scalar.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/tensor_utils.h"
-namespace paddle {
-namespace experimental {
+namespace paddle::experimental {
 
 // The Tensor must have one dim
 template <>
@@ -54,5 +53,4 @@ bool operator!=(const Scalar& lhs, const Scalar& rhs) {
 std::ostream& operator<<(std::ostream& os, const Scalar& s) {
   return os << s.ToString();
 }
-}  // namespace experimental
-}  // namespace paddle
+}  // namespace paddle::experimental
diff --git a/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto b/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto
index 70c9e72aa5fe7..71c18ac426019 100644
--- a/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto
+++ b/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto
@@ -25,7 +25,7 @@ message ProcessMeshProto {
   // There are no duplicate process ids within one process mesh.
   repeated int64 process_ids = 2;
 
-  // The name of each dimension. 
+  // The name of each dimension.
   repeated string dim_names = 3;
 
 }
@@ -37,17 +37,17 @@ message TensorDistAttrProto {
   optional ProcessMeshProto process_mesh = 1;
 
   // The length of dims_mapping is same as the length of the tensor shape.
-  // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension 
+  // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension
   // of the above process mesh. If dims_mapping[i] is -1, the i-th dimension of the tensor
   // will not be sharded. For example, given a tensor shape [2, 6, 12], a process mesh
   // shape [2, 3] and a dims_mapping [-1, 1, 0], each sharded tensor will have a shape [2, 2, 6].
   repeated int64 dims_mapping = 2;
 
-  // The batch dimension of the corresponding tensor. 
+  // The batch dimension of the corresponding tensor.
   optional int64 batch_dim = 3;
 
-  // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor 
-  // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined. 
+  // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor
+  // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined.
   repeated bool dynamic_dims = 4;
 
   // This field is used to distinguish vars which are in same process_mesh and in different vpp chunk
@@ -60,16 +60,16 @@ message OperatorDistAttrProto {
   message TensorDistAttrMappingEntryProto {
     optional string name = 1;
     optional TensorDistAttrProto tensor_dist_attr = 2;
-  } 
+  }
   // The key of this map is the input tensor name and the value is the distributed attribute
-  // of the input tensor required by this corresponding operator.   
-  // The distributed attribute of the actual tensor may be not the same as that within 
+  // of the input tensor required by this corresponding operator.
+  // The distributed attribute of the actual tensor may be not the same as that within
   // the distributed attribute of the operator.
   repeated TensorDistAttrMappingEntryProto input_dist_attrs = 1;
 
   // The key of this map is the output tensor name and the value is the distributed attribute
-  // of the output tensor required by this corresponding operator.   
-  // The distributed attribute of the actual tensor may be not the same as that within 
+  // of the output tensor required by this corresponding operator.
+  // The distributed attribute of the actual tensor may be not the same as that within
   // the distributed attribute of the operator.
   repeated TensorDistAttrMappingEntryProto output_dist_attrs = 2;
 
@@ -81,7 +81,7 @@ message OperatorDistAttrProto {
   // may shared the same distributed operator, the field is use for this scenario.
   optional string impl_type = 4;
 
-  // This field tells which distributed implementations of this corresponding operator 
+  // This field tells which distributed implementations of this corresponding operator
   // will be selected for the actual computation.
   optional int64 impl_idx = 5;
 
@@ -115,13 +115,13 @@ message DeviceProto {
   optional string type = 4;
 
   // The capability of this device.
-  optional DeviceCapabilityProto capability = 5; 
+  optional DeviceCapabilityProto capability = 5;
 }
 
-// This proto describes the capability of the link between two devices.        
-message LinkCapabilityProto {        
-  optional int64 bandwidth = 1; // Bytes/s       
-  optional int64 latency = 2;        
+// This proto describes the capability of the link between two devices.
+message LinkCapabilityProto {
+  optional int64 bandwidth = 1; // Bytes/s
+  optional int64 latency = 2;
 }
 
 message LinkProto {
@@ -133,14 +133,14 @@ message LinkProto {
 
   // Represent the link type.
   optional string type = 3;
-      
+
   // The capability of this link.
-  optional LinkCapabilityProto capability = 4; 
+  optional LinkCapabilityProto capability = 4;
 }
 
 // DeviceMesh is used to organize devices and like n-dimension array.
 message DeviceMeshProto {
-  // The global id of this mesh. 
+  // The global id of this mesh.
   optional string name = 1;
 
   // The size of each dimension.
@@ -150,13 +150,13 @@ message DeviceMeshProto {
   // There are no duplicate device ids within one device mesh.
   repeated int64 device_ids = 3;
 
-  // The name of each dimension. 
+  // The name of each dimension.
   repeated string dim_names = 4;
 
   // The devices of this mesh.
   repeated DeviceProto devices = 5;
 
-  // The links are between devices. 
+  // The links are between devices.
   repeated LinkProto links = 6;
 }
 
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc
index 62fbd97c46ab2..98dfa339589a5 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc
+++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc
@@ -21,8 +21,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/phi/core/distributed/auto_parallel/proto_helper.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 using phi::distributed::auto_parallel::str_join;
 using phi::distributed::auto_parallel::TensorDistAttrProto;
 
@@ -450,5 +449,4 @@ bool TensorDistAttr::is_partial(int64_t mesh_axis) const {
 
 void TensorDistAttr::set_skip_check_mesh(bool skip) { skip_check_mesh_ = skip; }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/proto_helper.cc b/paddle/phi/core/distributed/auto_parallel/proto_helper.cc
index fad63c15d63bd..d1cd5b36a5f73 100644
--- a/paddle/phi/core/distributed/auto_parallel/proto_helper.cc
+++ b/paddle/phi/core/distributed/auto_parallel/proto_helper.cc
@@ -23,8 +23,7 @@
   object.to_proto(&proto);                  \
   return proto
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 auto_parallel::TensorDistAttrProto to_proto(const TensorDistAttr& dist_attr) {
   TO_PROTO_HELPER(dist_attr, auto_parallel::TensorDistAttrProto);
@@ -61,5 +60,4 @@ auto_parallel::DistributedMapperProto to_proto(
     const auto_parallel::DistributedMapper& dist_mapper) {
   TO_PROTO_HELPER(dist_mapper, auto_parallel::DistributedMapperProto);
 }
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc
index 222e918ae540b..d2498c23e6eb7 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc
@@ -27,8 +27,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/reshard/same_status_reshard_function.h"
 #include "paddle/phi/core/distributed/store/store_utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 namespace {
 ProcessMesh GetSubProcessMesh(const ProcessMesh& mesh, int64_t axis) {
@@ -326,5 +325,4 @@ void CrossNdMeshReshardFunction::Eval(DeviceContext* dev_ctx,
   same_status_func.Eval(dev_ctx, tmp_result, out_dist_attr, out);
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.cc
index 1f8bb57293a45..91856cf8e928a 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.cc
@@ -25,8 +25,7 @@
 #include "paddle/phi/kernels/elementwise_divide_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 bool PToRReshardFunction::IsSuitable(const DistTensor& in,
                                      const TensorDistAttr& out_dist_attr) {
@@ -142,5 +141,4 @@ void PToRReshardFunctionCrossMesh::Eval(phi::DeviceContext* dev_ctx,
   }
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc
index faedf01bab140..64757505ac868 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc
@@ -26,8 +26,7 @@
 #include "paddle/phi/kernels/split_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 bool PToSReshardFunction::IsSuitable(const DistTensor& in,
                                      const TensorDistAttr& out_dist_attr) {
@@ -231,5 +230,4 @@ void PToSReshardFunctionCrossMesh::Eval(DeviceContext* dev_ctx,
   same_status_func.Eval(dev_ctx, tmp_result, out_dist_attr, out);
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/r_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/r_to_s_reshard_function.cc
index ef7208caf34bb..dafdfa48cb800 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/r_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/r_to_s_reshard_function.cc
@@ -22,8 +22,7 @@
 #include "paddle/phi/core/distributed/store/store_utils.h"
 #include "paddle/phi/kernels/split_kernel.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 bool RToSReshardFunction::IsSuitable(const DistTensor& in,
                                      const TensorDistAttr& out_dist_attr) {
@@ -141,5 +140,4 @@ void RToSReshardFunctionCrossMesh::Eval(phi::DeviceContext* dev_ctx,
   same_status_func.Eval(dev_ctx, tmp_result, out_dist_attr, out);
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/r_to_x_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/r_to_x_reshard_function.cc
index 396d7fdba8deb..ee179c268bff6 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/r_to_x_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/r_to_x_reshard_function.cc
@@ -27,8 +27,7 @@
 #include "paddle/phi/kernels/p_send_kernel.h"
 #include "paddle/phi/kernels/split_kernel.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 bool RToXExpandReshardFunction::IsSuitable(
     const DistTensor& in, const TensorDistAttr& out_dist_attr) {
@@ -134,5 +133,4 @@ void RToXExpandReshardFunction::Eval(phi::DeviceContext* dev_ctx,
   }
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc
index 99da6feb54eba..400627b0e1737 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc
@@ -20,8 +20,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -74,5 +73,4 @@ DenseTensor* ReshardFunction::GetMutableTensor(DistTensor* tensor) {
   return tensor->value_.get();
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc
index 01fbaf99c3c15..88b23e294c339 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc
@@ -23,8 +23,7 @@
 #include "paddle/phi/core/distributed/store/store_utils.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 namespace {
 std::string GenUniqueCommKey(const std::vector<int64_t>& process_ids) {
@@ -260,5 +259,4 @@ bool IsSubMesh(const ProcessMesh& global_mesh, const ProcessMesh& sub_mesh) {
   return false;
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/s_to_r_reshard_function.cc
index dbfbf1df8d284..8b3a1b32808af 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/s_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/s_to_r_reshard_function.cc
@@ -26,8 +26,7 @@
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/split_kernel.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 namespace {
 
@@ -221,5 +220,4 @@ void SToRReshardFunctionCrossMesh::Eval(DeviceContext* dev_ctx,
   }
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc
index bd415480d64e9..947a4b77f6961 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc
@@ -25,8 +25,7 @@
 #include "paddle/phi/kernels/p_recv_kernel.h"
 #include "paddle/phi/kernels/p_send_kernel.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 bool XToRShrinkReshardFunction::IsSuitable(
     const DistTensor& in, const TensorDistAttr& out_dist_attr) {
@@ -130,5 +129,4 @@ void XToRShrinkReshardFunction::Eval(phi::DeviceContext* dev_ctx,
   }
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc
index 9307af45bd622..6c5fa3b81dd08 100644
--- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc
+++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc
@@ -42,8 +42,7 @@
 #define gpuFree cudaFree
 #endif
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor,
                                      int64_t dtype) {
   PADDLE_ENFORCE_EQ(
@@ -197,5 +196,4 @@ void NCCLDynamicCheck::CheckGatherShape(
     }
   }
 }
-}  //  namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/check/static_check.cc b/paddle/phi/core/distributed/check/static_check.cc
index 25cdc8d01262e..16504f23e2a10 100644
--- a/paddle/phi/core/distributed/check/static_check.cc
+++ b/paddle/phi/core/distributed/check/static_check.cc
@@ -21,8 +21,7 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 void CommStaticCheck::CheckRank(int rank, int world_size) {
   PADDLE_ENFORCE_GE(rank,
@@ -163,5 +162,4 @@ void CommStaticCheck::GatherLikeShape(const phi::DenseTensor& out_tensor,
              place);
 }
 
-}  //  namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc
index 4c8f500c406f4..ddd9c70001c16 100644
--- a/paddle/phi/core/distributed/comm_context_manager.cc
+++ b/paddle/phi/core/distributed/comm_context_manager.cc
@@ -43,8 +43,7 @@
 #include "paddle/phi/core/distributed/xccl_comm_context.h"
 #endif
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 int CommContextManager::device_id = -1;
 
@@ -295,5 +294,4 @@ std::vector<int> CommContextManager::GetGroupRanks(
   return pg_key_ranks_.at(pg_key);
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/gloo_utils.cc b/paddle/phi/core/distributed/gloo_utils.cc
index ed9ec67710618..d8d86f25fa6c7 100644
--- a/paddle/phi/core/distributed/gloo_utils.cc
+++ b/paddle/phi/core/distributed/gloo_utils.cc
@@ -31,8 +31,7 @@
 #include "paddle/phi/core/distributed/store/tcp_utils.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 std::shared_ptr<gloo::transport::Device> CreateDeviceForInterface(
     const std::string& ifname) {
   gloo::transport::tcp::attr attr;
@@ -106,5 +105,4 @@ void send_recv(SendRecvOptions* opts) {
   }
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc
index 31740d95b7b24..a3095a2ba5f5c 100644
--- a/paddle/phi/core/distributed/nccl_comm_context.cc
+++ b/paddle/phi/core/distributed/nccl_comm_context.cc
@@ -24,8 +24,7 @@
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/utils/data_type.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 // set this flag to `true` and recompile to enable dynamic checks
 constexpr bool FLAGS_enable_nccl_dynamic_check = false;
@@ -251,5 +250,4 @@ void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) {
 }
 #endif
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc
index c94e7f0a02db0..0cc32e0ad04b3 100644
--- a/paddle/phi/core/distributed/nccl_comm_task.cc
+++ b/paddle/phi/core/distributed/nccl_comm_task.cc
@@ -21,8 +21,7 @@
 #include "paddle/phi/core/distributed/nccl_tools.h"
 #include "paddle/phi/core/utils/data_type.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 NCCLCommTask::NCCLCommTask(const phi::Place& place,
                            const std::string& group_key,
@@ -266,5 +265,4 @@ std::string NCCLCommTask::GetTraceMsg() {
          ",nranks:" + std::to_string(size_);
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h
index 494fe160696ff..e63ced99ec539 100644
--- a/paddle/phi/core/infermeta_utils.h
+++ b/paddle/phi/core/infermeta_utils.h
@@ -49,6 +49,8 @@ class InferMetaContext {
   void EmplaceBackOutputs(
       paddle::small_vector<MetaTensor, phi::kOutputSmallVectorSize> outputs);
 
+  // Replaces the idx-th stored input; idx is not range-checked here.
+  void UpdataInput(size_t idx, MetaTensor input) { inputs_[idx] = input; }
   TEST_API virtual const MetaTensor& InputAt(size_t idx) const;
 
   TEST_API virtual std::vector<const MetaTensor*> InputsBetween(
@@ -68,6 +70,10 @@ class InferMetaContext {
   const std::pair<int, int>& InputRangeAt(size_t idx) const;
   TEST_API const std::pair<int, int>& OutputRangeAt(size_t idx) const;
 
+  size_t InputsSize() const { return inputs_.size(); }  // entries in inputs_
+  size_t OutputsSize() const { return outputs_.size(); }  // entries in outputs_
+  size_t AttrsSize() const { return attrs_.size(); }  // entries in attrs_
+
   virtual ~InferMetaContext() = default;
 
  protected:
diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h
index 947af3af1d089..5fa75214fcfb5 100644
--- a/paddle/phi/core/kernel_context.h
+++ b/paddle/phi/core/kernel_context.h
@@ -75,6 +75,10 @@ class KernelContext {
 
   void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
 
+  void UpdataInput(size_t idx, const TensorBase* input) {
+    inputs_[idx] = input;  // replaces slot idx; idx is not range-checked here
+  }
+
   template <typename TensorType>
   const TensorType& InputAt(size_t idx) const {
     return static_cast<const TensorType&>(*(inputs_.at(idx)));
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index e16ec77a3b0e1..4d1d37b541f09 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -128,7 +128,7 @@ void ChannelShuffleGradInferMeta(const MetaTensor& out_grad,
                         "Input should be a 4-D tensor of format [N, C, H, W] "
                         "or [N, H, W, C], but got %u.",
                         do_dims.size()));
-  auto dx_dims = do_dims;
+  const auto& dx_dims = do_dims;
   x_grad->set_dims(dx_dims);
   x_grad->set_dtype(out_grad.dtype());
 }
@@ -445,6 +445,34 @@ void CudnnLSTMGradInferMeta(
   }
 }
 
+// Infers the meta of each LSTM gradient output by calling share_meta with the
+// forward tensor it corresponds to. Outputs passed as nullptr are skipped.
+// `config` is part of the signature but is not read by this function.
+void LSTMGradInferMeta(const MetaTensor& input,
+                       const MetaTensor& h0,
+                       const MetaTensor& c0,
+                       const MetaTensor& weight,
+                       const MetaTensor& bias,
+                       MetaTensor* input_grad,
+                       MetaTensor* h0_grad,
+                       MetaTensor* c0_grad,
+                       MetaTensor* weight_grad,
+                       MetaTensor* bias_grad,
+                       MetaConfig config) {
+  // Applies share_meta(src) to `grad` only when that output was provided.
+  const auto mirror_meta = [](MetaTensor* grad, const MetaTensor& src) {
+    if (grad != nullptr) {
+      grad->share_meta(src);
+    }
+  };
+
+  mirror_meta(input_grad, input);
+  mirror_meta(h0_grad, h0);
+  mirror_meta(c0_grad, c0);
+  mirror_meta(weight_grad, weight);
+  mirror_meta(bias_grad, bias);
+}
+
 void DeformableConvGradInferMeta(const MetaTensor& x,
                                  const MetaTensor& offset,
                                  const MetaTensor& filter,
diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h
index e9971b5042ac0..89795c008d34d 100644
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -167,6 +167,18 @@ void CudnnLSTMGradInferMeta(
     MetaTensor* init_c_grad,
     std::vector<MetaTensor*> weight_list_grad);
 
+void LSTMGradInferMeta(const MetaTensor& input,
+                       const MetaTensor& h0,
+                       const MetaTensor& c0,
+                       const MetaTensor& weight,
+                       const MetaTensor& bias,
+                       MetaTensor* input_grad,   // nullptr is skipped
+                       MetaTensor* h0_grad,      // nullptr is skipped
+                       MetaTensor* c0_grad,      // nullptr is skipped
+                       MetaTensor* weight_grad,  // nullptr is skipped
+                       MetaTensor* bias_grad,    // nullptr is skipped
+                       MetaConfig config = MetaConfig());  // not read by impl
+
 void DeformableConvGradInferMeta(const MetaTensor& x,
                                  const MetaTensor& offset,
                                  const MetaTensor& filter,
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index 3c3ef874854ab..aa4028efa1a6e 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -3573,6 +3573,45 @@ void TakeAlongAxisInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+// Infers meta for tdm_child: `child` and `leaf_mask` take x's dims with one
+// extra trailing axis of length `child_nums`, plus x's lod and dtype.
+// `tree_info` must be 2-D. NOTE(review): `dtype` is not consulted here.
+void TdmChildInferMeta(const MetaTensor& x,
+                       const MetaTensor& tree_info,
+                       int child_nums,
+                       DataType dtype,
+                       MetaTensor* child,
+                       MetaTensor* leaf_mask) {
+  PADDLE_ENFORCE_GT(
+      child_nums,
+      0,
+      phi::errors::InvalidArgument(
+          "ValueError: The value of the 'child_nums' must greater than 0. "
+          "But received child_nums value = %d, ",
+          child_nums));
+  const auto& info_dims = tree_info.dims();
+  PADDLE_ENFORCE_EQ(
+      info_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "ShapeError: The dimensions of the 'tree info' must be 2. "
+          "But received tree info's dimensions = %d, "
+          "tree info's shape = [%s].",
+          info_dims.size(),
+          info_dims));
+  auto output_dims = common::vectorize(x.dims());
+  output_dims.push_back(child_nums);
+  const auto out_ddim = common::make_ddim(output_dims);
+  // Each output is checked on its own so that neither a null `child` nor a
+  // null `leaf_mask` is ever dereferenced.
+  for (MetaTensor* out : {child, leaf_mask}) {
+    if (out == nullptr) continue;
+    out->set_dims(out_ddim);
+    out->share_lod(x);
+    out->set_dtype(x.dtype());
+  }
+}
+
 void TriangularSolveInferMeta(const MetaTensor& x,
                               const MetaTensor& y,
                               bool upper,
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index e166746e3a646..391d01debd7a3 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -635,6 +635,13 @@ void TakeAlongAxisInferMeta(const MetaTensor& x,
                             int axis,
                             MetaTensor* out);
 
+void TdmChildInferMeta(const MetaTensor& x,
+                       const MetaTensor& tree_info,  // enforced to be 2-D
+                       int child_nums,               // enforced to be > 0
+                       DataType dtype,         // not read by the definition
+                       MetaTensor* child,      // dims = x dims + [child_nums]
+                       MetaTensor* leaf_mask);  // same meta as `child`
+
 void TriangularSolveInferMeta(const MetaTensor& x,
                               const MetaTensor& y,
                               bool upper,
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
index 9987524d4997d..c243f640446a0 100644
--- a/paddle/phi/infermeta/fusion.cc
+++ b/paddle/phi/infermeta/fusion.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
+#include "paddle/phi/kernels/funcs/fused_elemwise_activation_functor.h"
 #include "paddle/phi/kernels/funcs/strided_slice.h"
 
 namespace phi {
@@ -377,6 +378,89 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv,
   }
 }
 
+void BlockMultiheadAttentionInferXPUMeta(  // XPU-signature forwarder
+    const MetaTensor& qkv,
+    const MetaTensor& key_cache,
+    const MetaTensor& value_cache,
+    const MetaTensor& seq_lens_encoder,
+    const MetaTensor& seq_lens_decoder,
+    const MetaTensor& seq_lens_this_time,
+    const MetaTensor& padding_offsets,
+    const MetaTensor& cum_offsets,
+    const MetaTensor& cu_seqlens_q,
+    const MetaTensor& cu_seqlens_k,
+    const MetaTensor& cache_k_per_batch_maxs,  // not passed to the callee
+    const MetaTensor& cache_v_per_batch_maxs,  // not passed to the callee
+    const MetaTensor& block_tables,
+    const MetaTensor& pre_key_cache,
+    const MetaTensor& pre_value_cache,
+    const MetaTensor& rope_emb,
+    const MetaTensor& mask,
+    const MetaTensor& tgt_mask,
+    const MetaTensor& cache_k_quant_scales,
+    const MetaTensor& cache_v_quant_scales,
+    const MetaTensor& cache_k_dequant_scales,
+    const MetaTensor& cache_v_dequant_scales,
+    const MetaTensor& qkv_out_scale,
+    const MetaTensor& qkv_bias,
+    const MetaTensor& out_shift,
+    const MetaTensor& out_smooth,
+    const MetaTensor& max_enc_len_this_time,
+    const MetaTensor& max_dec_len_this_time,
+    int max_seq_len,
+    int block_size,
+    bool use_neox_style,
+    bool dynamic_cachekv_quant,
+    const int quant_round_type,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const float out_scale,
+    const std::string& compute_dtype,
+    MetaTensor* fmha_out,
+    MetaTensor* qkv_out,
+    MetaTensor* key_cache_out,
+    MetaTensor* value_cache_out) {
+  BlockMultiheadAttentionInferMeta(qkv,  // every other argument, same order
+                                   key_cache,
+                                   value_cache,
+                                   seq_lens_encoder,
+                                   seq_lens_decoder,
+                                   seq_lens_this_time,
+                                   padding_offsets,
+                                   cum_offsets,
+                                   cu_seqlens_q,
+                                   cu_seqlens_k,
+                                   block_tables,  // per-batch maxs omitted here
+                                   pre_key_cache,
+                                   pre_value_cache,
+                                   rope_emb,
+                                   mask,
+                                   tgt_mask,
+                                   cache_k_quant_scales,
+                                   cache_v_quant_scales,
+                                   cache_k_dequant_scales,
+                                   cache_v_dequant_scales,
+                                   qkv_out_scale,
+                                   qkv_bias,
+                                   out_shift,
+                                   out_smooth,
+                                   max_enc_len_this_time,
+                                   max_dec_len_this_time,
+                                   max_seq_len,
+                                   block_size,
+                                   use_neox_style,
+                                   dynamic_cachekv_quant,
+                                   quant_round_type,
+                                   quant_max_bound,
+                                   quant_min_bound,
+                                   out_scale,
+                                   compute_dtype,
+                                   fmha_out,
+                                   qkv_out,
+                                   key_cache_out,
+                                   value_cache_out);
+}
+
 void Conv1dXPUInferMeta(const MetaTensor& x,
                         const MetaTensor& x_max,
                         const MetaTensor& filter,
@@ -4483,4 +4567,120 @@ void FusedTokenPruneInferMeta(const MetaTensor& attn,
   cls_inds->set_dtype(DataType::INT64);
 }
 
+// Infers the meta of `out` and, when `save_intermediate_out` is set, of
+// `intermediate_out` for the fused elementwise + activation op.
+//   - `out` takes dims, lod and dtype from x when
+//     phi::funcs::IsBcastY(x.dims(), y.dims()) is true, and from y otherwise.
+//   - `intermediate_out` matches `out` for Unary(Binary(X, Y)) and matches y
+//     for Binary(X, Unary(Y)), as decided by phi::funcs::IsUnaryCompound.
+// `axis`, `scale` and `config` are not read by this function.
+void FusedElemwiseActivationInferMeta(
+    const MetaTensor& x,
+    const MetaTensor& y,
+    const std::vector<std::string>& functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    MetaTensor* out,
+    MetaTensor* intermediate_out,
+    MetaConfig config) {
+  const auto& x_dim = x.dims();
+  const auto& y_dim = y.dims();
+
+  // Whether the shape of Y is a continuous subsequence of X.
+  const bool bcast_y = phi::funcs::IsBcastY(x_dim, y_dim);
+  // Tensor whose dims, lod and dtype `out` inherits.
+  const MetaTensor& out_src = bcast_y ? x : y;
+  const auto& out_dim = bcast_y ? x_dim : y_dim;
+
+  if (save_intermediate_out) {
+    // Test the pointer before calling through it, so that a missing output
+    // raises InvalidArgument instead of dereferencing nullptr.
+    PADDLE_ENFORCE_EQ(
+        intermediate_out != nullptr && intermediate_out->initialized(),
+        true,
+        phi::errors::InvalidArgument(
+            "Output(IntermediateOut) of FusedElemwiseActivationOp "
+            "should not be null."));
+
+    // For Unary(Binary(X, Y)) the intermediate shares its meta source with
+    // `out`; for Binary(X, Unary(Y)) it takes its meta from y.
+    const bool like_out = phi::funcs::IsUnaryCompound(functor_list);
+    const MetaTensor& mid_src = like_out ? out_src : y;
+    intermediate_out->set_dims(like_out ? out_dim : y_dim);
+    intermediate_out->share_lod(mid_src);
+    intermediate_out->set_dtype(mid_src.dtype());
+  }
+
+  out->set_dims(out_dim);
+  out->share_lod(out_src);
+  out->set_dtype(out_src.dtype());
+}
+
+// Infers the meta of x_grad and y_grad for the backward pass of the fused
+// elementwise + activation op. Gradients passed as nullptr are skipped.
+// x_grad takes dims, lod and dtype from x, or from out_grad when x is not
+// initialized; y_grad takes them from y. `out`, `axis`, `scale` and `config`
+// are not read by this function.
+void FusedElemwiseActivationGradInferMeta(
+    const MetaTensor& x,
+    const MetaTensor& y,
+    const MetaTensor& out,
+    const MetaTensor& intermediate_out,
+    const MetaTensor& out_grad,
+    const std::vector<std::string>& functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    MetaTensor* x_grad,
+    MetaTensor* y_grad,
+    MetaConfig config) {
+  PADDLE_ENFORCE_EQ(
+      out_grad.initialized(),
+      true,
+      phi::errors::InvalidArgument("Input(Out@Grad) should not be null."));
+
+  if (save_intermediate_out) {
+    PADDLE_ENFORCE_EQ(intermediate_out.initialized(),
+                      true,
+                      phi::errors::InvalidArgument(
+                          "Input(IntermediateOut) should not be null."));
+  } else if (!phi::funcs::InputXCanBeAbsent(functor_list)) {
+    PADDLE_ENFORCE_EQ(
+        x.initialized(),
+        true,
+        phi::errors::InvalidArgument("Input(X) should not be null."));
+  }
+
+  if (x_grad != nullptr) {
+    const bool has_x = x.initialized();
+    if (!has_x) {
+      // X may be absent only if InputXCanBeAbsent accepts functor_list.
+      PADDLE_ENFORCE_EQ(
+          phi::funcs::InputXCanBeAbsent(functor_list),
+          true,
+          phi::errors::InvalidArgument(
+              "Only when BinaryFunctor is elementwise_add, the 'X' "
+              "could be absent."));
+    }
+    // Note: when X is absent, the shape of Y should be a continuous
+    // subsequence of X, otherwise the shape of dx cannot be inferred; the
+    // meta of out_grad is used in place of X's.
+    const MetaTensor& dx_src = has_x ? x : out_grad;
+    x_grad->set_dims(dx_src.dims());
+    x_grad->share_lod(dx_src);
+    x_grad->set_dtype(dx_src.dtype());
+  }
+
+  if (y_grad != nullptr) {
+    PADDLE_ENFORCE_EQ(
+        y.initialized(),
+        true,
+        phi::errors::InvalidArgument("Input(Y) should not be null."));
+    y_grad->set_dims(y.dims());
+    y_grad->share_lod(y);
+    y_grad->set_dtype(y.dtype());
+  }
+}
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index aa48f64434ee3..528ce5dff8b7e 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -128,6 +128,49 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv,
                                       MetaTensor* key_cache_out,
                                       MetaTensor* value_cache_out);
 
+void BlockMultiheadAttentionInferXPUMeta(  // XPU-signature forwarder
+    const MetaTensor& qkv,
+    const MetaTensor& key_cache,
+    const MetaTensor& value_cache,
+    const MetaTensor& seq_lens_encoder,
+    const MetaTensor& seq_lens_decoder,
+    const MetaTensor& seq_lens_this_time,
+    const MetaTensor& padding_offsets,
+    const MetaTensor& cum_offsets,
+    const MetaTensor& cu_seqlens_q,
+    const MetaTensor& cu_seqlens_k,
+    const MetaTensor& cache_k_per_batch_maxs,  // not forwarded by the impl
+    const MetaTensor& cache_v_per_batch_maxs,  // not forwarded by the impl
+    const MetaTensor& block_tables,
+    const MetaTensor& pre_key_cache,
+    const MetaTensor& pre_value_cache,
+    const MetaTensor& rope_emb,
+    const MetaTensor& mask,
+    const MetaTensor& tgt_mask,
+    const MetaTensor& cache_k_quant_scales,
+    const MetaTensor& cache_v_quant_scales,
+    const MetaTensor& cache_k_dequant_scales,
+    const MetaTensor& cache_v_dequant_scales,
+    const MetaTensor& qkv_out_scale,
+    const MetaTensor& qkv_bias,
+    const MetaTensor& out_shift,
+    const MetaTensor& out_smooth,
+    const MetaTensor& max_enc_len_this_time,
+    const MetaTensor& max_dec_len_this_time,
+    int max_seq_len,
+    int block_size,
+    bool use_neox_style,
+    bool dynamic_cachekv_quant,
+    const int quant_round_type,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const float out_scale,
+    const std::string& compute_dtype,
+    MetaTensor* fmha_out,  // outputs go to BlockMultiheadAttentionInferMeta
+    MetaTensor* qkv_out,
+    MetaTensor* key_cache_out,
+    MetaTensor* value_cache_out);
+
 void Conv1dXPUInferMeta(const MetaTensor& x,
                         const MetaTensor& x_max,
                         const MetaTensor& filter,
@@ -1006,4 +1049,29 @@ void FusedTokenPruneInferMeta(const MetaTensor& attn,
                               MetaTensor* slimmed_x,
                               MetaTensor* cls_inds);
 
+void FusedElemwiseActivationInferMeta(  // prototype; body not in this hunk
+    const MetaTensor& x,
+    const MetaTensor& y,
+    const std::vector<std::string>& functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    MetaTensor* out,
+    MetaTensor* intermediate_out,
+    MetaConfig config = MetaConfig());  // optional; default-constructed
+
+void FusedElemwiseActivationGradInferMeta(  // prototype; body not in this hunk
+    const MetaTensor& x,
+    const MetaTensor& y,
+    const MetaTensor& out,
+    const MetaTensor& intermediate_out,
+    const MetaTensor& out_grad,
+    const std::vector<std::string>& functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    MetaTensor* x_grad,
+    MetaTensor* y_grad,
+    MetaConfig config = MetaConfig());  // optional; default-constructed
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index a80997970f8fb..9b7eaed7c3bf3 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "glog/logging.h"
 
 #include "paddle/common/layout.h"
-#include "paddle/phi/backends/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_alignment.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/infermeta_utils.h"
@@ -1349,6 +1349,115 @@ void CudnnLSTMInferMeta(
   state_out->set_dtype(phi::DataType::UINT8);
 }
 
+void LSTMInferMeta(const MetaTensor& input,  // shape checks + output meta
+                   const MetaTensor& h0,
+                   const MetaTensor& c0,
+                   const MetaTensor& weight,
+                   const MetaTensor& bias,
+                   bool use_peepholes,
+                   bool is_reverse,  // unused here
+                   bool is_test,
+                   const std::string& gate_activation,  // unused here
+                   const std::string& cell_activation,  // unused here
+                   const std::string& candidate_activation,  // unused here
+                   MetaTensor* hidden,
+                   MetaTensor* cell,
+                   MetaTensor* batch_gate,
+                   MetaTensor* batch_cell_pre_act,
+                   MetaConfig config) {  // config is unused here
+  const auto& in_dims = input.dims();
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "Input(X)'s rank must be 2, but received %d.", in_dims.size()));
+  // h0 is optional; when present, c0 must be initialized with identical dims.
+  if (h0) {
+    PADDLE_ENFORCE_EQ(
+        c0.initialized(),
+        true,
+        phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM "
+                              "should not be null at the same time."));
+    const auto& h_dims = h0.dims();
+    const auto& c_dims = c0.dims();
+    PADDLE_ENFORCE_EQ(h_dims,
+                      c_dims,
+                      phi::errors::InvalidArgument(
+                          "The dimension of Input(H0) and Input(C0) should "
+                          "be the same, but received [%s] (H0) vs [%s] (C0).",
+                          h_dims,
+                          c_dims));
+  }
+  // frame_size is in_dims[1] / 4; weight must be [frame_size, 4 * frame_size].
+  int frame_size = static_cast<int>(in_dims[1] / 4);
+  const auto& w_dims = weight.dims();
+  PADDLE_ENFORCE_EQ(
+      w_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The rank of Input(Weight) should be 2, but received %d.",
+          w_dims.size()));
+  PADDLE_ENFORCE_EQ(w_dims[0],
+                    frame_size,
+                    phi::errors::InvalidArgument(
+                        "The first dimension of Input(Weight) should be %d, "
+                        "but received %d.",
+                        frame_size,
+                        w_dims[0]));
+  PADDLE_ENFORCE_EQ(w_dims[1],
+                    4 * frame_size,
+                    phi::errors::InvalidArgument(
+                        "The second dimension of Input(Weight) should be 4 * "
+                        "%d, but received %d.",
+                        frame_size,
+                        w_dims[1]));
+  // bias: [1, 7 * frame_size] with peepholes, else [1, 4 * frame_size].
+  const auto& b_dims = bias.dims();
+  PADDLE_ENFORCE_EQ(b_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The rank of Input(Bias) should be 2, but received %d.",
+                        b_dims.size()));
+  PADDLE_ENFORCE_EQ(
+      b_dims[0],
+      1,
+      phi::errors::InvalidArgument(
+          "The first dimension of Input(Bias) should be 1, but received %d.",
+          b_dims[0]));
+
+  if (use_peepholes) {
+    PADDLE_ENFORCE_EQ(
+        b_dims[1],
+        7 * frame_size,
+        phi::errors::InvalidArgument(
+            "The second dimension of Input(Bias) should be 7 * %d if enable "
+            "peepholes connection, but received %d.",
+            frame_size,
+            b_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        b_dims[1],
+        4 * frame_size,
+        phi::errors::InvalidArgument(
+            "The second dimension of Input(Bias) should be 4 * %d if disable "
+            "peepholes connection, but received %d.",
+            frame_size,
+            b_dims[1]));
+  }
+  // hidden/cell: [in_dims[0], frame_size]; batch_* dims only when !is_test.
+  phi::DDim out_dims({in_dims[0], frame_size});
+  hidden->set_dims(out_dims);
+  cell->set_dims(out_dims);
+  if (!is_test) {
+    batch_gate->set_dims(in_dims);
+    batch_cell_pre_act->set_dims(out_dims);
+  }
+  hidden->share_lod(input);
+  cell->share_lod(input);
+  hidden->set_dtype(input.dtype());
+  cell->set_dtype(input.dtype());  // NOTE(review): batch_* dtype is not set
+}
+
 void DecayedAdagradInferMeta(const MetaTensor& param,
                              const MetaTensor& grad,
                              const MetaTensor& moment,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 56dff7422b2cc..a73212505f669 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -292,6 +292,23 @@ void CudnnLSTMInferMeta(
     MetaTensor* reserve,
     MetaTensor* state_out);
 
+void LSTMInferMeta(const MetaTensor& input,  // defined in multiary.cc
+                   const MetaTensor& h0,
+                   const MetaTensor& c0,
+                   const MetaTensor& weight,
+                   const MetaTensor& bias,
+                   bool use_peepholes,
+                   bool is_reverse,
+                   bool is_test,
+                   const std::string& gate_activation,
+                   const std::string& cell_activation,
+                   const std::string& candidate_activation,
+                   MetaTensor* hidden,
+                   MetaTensor* cell,
+                   MetaTensor* batch_gate,  // dims set only if !is_test
+                   MetaTensor* batch_cell_pre_act,  // likewise
+                   MetaConfig config = MetaConfig());  // not read by the body
+
 void DecayedAdagradInferMeta(const MetaTensor& param,
                              const MetaTensor& grad,
                              const MetaTensor& moment,
diff --git a/paddle/phi/infermeta/sparse/unary.cc b/paddle/phi/infermeta/sparse/unary.cc
index f624df0d8c55a..106a4a84474e2 100644
--- a/paddle/phi/infermeta/sparse/unary.cc
+++ b/paddle/phi/infermeta/sparse/unary.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/sparse/unary.h"
 #include "paddle/phi/core/infermeta_utils.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 void IndicesInferMeta(const MetaTensor& x, MetaTensor* out) {
   // TODO(zhangkaihuo) Currently, we cannot get sparse_dim from tensor.
@@ -51,5 +50,4 @@ void CastInferMeta(const MetaTensor& x,
   }
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.cc b/paddle/phi/infermeta/spmd_rules/elementwise.cc
index 4e12c994b595b..099f7b7d54fd2 100644
--- a/paddle/phi/infermeta/spmd_rules/elementwise.cc
+++ b/paddle/phi/infermeta/spmd_rules/elementwise.cc
@@ -21,8 +21,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -531,5 +530,4 @@ SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x,
   info.first.emplace(info.first.begin() + 2, out_grad.dist_attr());
   return info;
 }
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/expand_as.cc b/paddle/phi/infermeta/spmd_rules/expand_as.cc
index 6bd663c826664..ea26fe7b54c26 100644
--- a/paddle/phi/infermeta/spmd_rules/expand_as.cc
+++ b/paddle/phi/infermeta/spmd_rules/expand_as.cc
@@ -25,7 +25,7 @@ std::tuple<TensorDistAttr, TensorDistAttr> AlignExpandAsDistAttrs(
   auto x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
   auto y_dist_attr_dst = CopyTensorDistAttrForOutput(y_dist_attr_src);
   auto x_dims_mapping_dst = x_dims_mapping_src;
-  auto y_dims_mapping_dst = y_dims_mapping_src;
+  const auto& y_dims_mapping_dst = y_dims_mapping_src;
   int dims_diff = y_ndim - x_ndim;
   for (int i = 0; i < y_ndim; ++i) {
     if (i >= dims_diff) {
diff --git a/paddle/phi/infermeta/spmd_rules/flash_attention.cc b/paddle/phi/infermeta/spmd_rules/flash_attention.cc
index 737ad4eff03c9..cd2cfacad3d37 100644
--- a/paddle/phi/infermeta/spmd_rules/flash_attention.cc
+++ b/paddle/phi/infermeta/spmd_rules/flash_attention.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 const int kNumHeadsDimIndex = 2;
 
 #define LOG_SPMD_INPUT(name)                                                  \
@@ -806,5 +805,4 @@ SpmdInfo FlashAttGradInferSpmd(const DistMetaTensor& q,
           {q_grad, k_grad, v_grad}};
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/flatten.cc b/paddle/phi/infermeta/spmd_rules/flatten.cc
index a0f084b491771..0cff9a46c5656 100644
--- a/paddle/phi/infermeta/spmd_rules/flatten.cc
+++ b/paddle/phi/infermeta/spmd_rules/flatten.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
 #include "paddle/phi/infermeta/spmd_rules/dim_trans.h"
+#include "paddle/phi/infermeta/spmd_rules/reshape.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
 namespace phi {
@@ -105,41 +106,31 @@ SpmdInfo FlattenInferSpmd(const DistMetaTensor& x,
                                    x_ndim,
                                    x_dims_mapping.size()));
 
-  // Step1: Build the transformation from
-  // the original shape to the target shape
-
+  // obtain target shape and use ReshapeInferSpmdDynamic to infer
   start_axis = PreprocessAxis(start_axis, x_ndim);
   stop_axis = PreprocessAxis(stop_axis, x_ndim);
-  std::vector<std::shared_ptr<DimTrans>> trans =
-      MakeFlattenDimTrans(src_shape, start_axis, stop_axis);
-
-  // Step2: Infer the dims mapping of input (if reshard is
-  // needed) and output from the dimension transformation.
-  std::vector<std::vector<int64_t>> dims_mapping_vec =
-      InferFromDimTrans(x, trans);
-
-  // Step3: Update the dist attributes of input
-  // and output with the inferred dims mapping.
-  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
-  x_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]);
-  TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src);
-  out_dist_attr.set_dims_mapping(dims_mapping_vec[1]);
+  std::vector<int64_t> dst_shape;
+  int64_t flatten_size = 1;
+  for (int64_t i = 0; i < x_ndim; i++) {
+    if (i < start_axis || i > stop_axis) {
+      dst_shape.emplace_back(src_shape[i]);
+    } else {
+      flatten_size *= src_shape[i];
+      if (i == stop_axis) {
+        dst_shape.emplace_back(flatten_size);
+      }
+    }
+  }
 
   VLOG(4) << "FlattenInferSpmd: X shape: [" << str_join(src_shape) << "]";
   VLOG(4) << "Start_axis: " << start_axis;
-  VLOG(4) << "Stop_axis: " << start_axis;
-  VLOG(4) << "Transformation from input to output:";
-  for (int64_t i = 0, n = static_cast<int64_t>(trans.size()); i < n; i++) {
-    std::shared_ptr<DimTrans> t = trans[i];
-    VLOG(4) << "\tOut axis[" << i << "]: " << t->to_string();
-  }
-  VLOG(4) << "X dims_mapping_src: [" << str_join(x_dims_mapping)
-          << "] dims_mapping_dst: [" << str_join(dims_mapping_vec[0]) << "]";
-  VLOG(4) << "Out dims_mapping: [" << str_join(dims_mapping_vec[1]) << "]\n\n";
-
-  return {{x_dist_attr_dst}, {out_dist_attr}};
+  VLOG(4) << "Stop_axis: " << stop_axis;
+  VLOG(4) << "FlattenInferSpmd: output shape: [" << str_join(dst_shape) << "]";
+  VLOG(4) << "use ReshapeInferSpmdDynamic to infer distributed attribute";
+  return ReshapeInferSpmdDynamic(x, dst_shape);
 }
 
+// TODO(jeff41404): consider xshape and use ReshapeInferSpmdReverse in future
 SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x,
                                  const DistMetaTensor& out,
                                  int start_axis,
@@ -198,5 +189,16 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x,
   return {{x_dist_attr}, {out_dist_attr_dst}};
 }
 
+SpmdInfo FlattenGradInferSpmd(const DistMetaTensor& xshape,
+                              const DistMetaTensor& out_grad) {
+  // Runs ReshapeInferSpmd on out_grad, targeting xshape's dims without dim 0.
+  // TODO(jeff41404): call ReshapeGradInferSpmd(xshape, out_grad) directly once
+  // ReshapeInferSpmd/ReshapeGradInferSpmd deliver xshape's dist attr.
+  auto shape = phi::vectorize(xshape.dims());  // NOTE(review): needs rank >= 1
+  shape = std::vector<int64_t>(shape.begin() + 1, shape.end());
+  const auto& spmd = ReshapeInferSpmd(out_grad, shape);
+  return {{xshape.dist_attr(), spmd.first[0]}, {spmd.second[0]}};
+}
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/flatten.h b/paddle/phi/infermeta/spmd_rules/flatten.h
index bb62d8c0d7b0a..28bf5e56d5256 100644
--- a/paddle/phi/infermeta/spmd_rules/flatten.h
+++ b/paddle/phi/infermeta/spmd_rules/flatten.h
@@ -30,5 +30,8 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x,
                                  const DistMetaTensor& out,
                                  int start_axis,
                                  int stop_axis);
+
+SpmdInfo FlattenGradInferSpmd(const DistMetaTensor& xshape,  // in flatten.cc
+                              const DistMetaTensor& out_grad);
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/full_like.cc b/paddle/phi/infermeta/spmd_rules/full_like.cc
index 6089865766b31..0670df21ab153 100644
--- a/paddle/phi/infermeta/spmd_rules/full_like.cc
+++ b/paddle/phi/infermeta/spmd_rules/full_like.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/full_like.h"
 #include "paddle/phi/infermeta/spmd_rules/elementwise.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 SpmdInfo FullLikeInferSpmd(const DistMetaTensor& x,
                            const Scalar& y,
                            phi::DataType dtype) {
@@ -24,5 +23,4 @@ SpmdInfo FullLikeInferSpmd(const DistMetaTensor& x,
   out_dist_attr.clean_partial_status();
   return {{x.dist_attr()}, {out_dist_attr}};
 }
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.cc b/paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.cc
index c2afdd86b57a7..295d5fa593da5 100644
--- a/paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.cc
+++ b/paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/matmul.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 SpmdInfo FusedLinearParamGradAddInferSpmd(const DistMetaTensor& x,
                                           const DistMetaTensor& dout,
@@ -73,5 +72,4 @@ SpmdInfo FusedLinearParamGradAddInferSpmd(const DistMetaTensor& x,
 
 SpmdInfo FusedLinearParamGradAddInferSpmdFakeReverse() { return SpmdInfo(); }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/fused_rope.cc b/paddle/phi/infermeta/spmd_rules/fused_rope.cc
index 8099c12aa0e1b..f3aa720d61ece 100644
--- a/paddle/phi/infermeta/spmd_rules/fused_rope.cc
+++ b/paddle/phi/infermeta/spmd_rules/fused_rope.cc
@@ -563,6 +563,7 @@ SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin,
                                           time_major);
   std::vector<ArgDistAttr> dist_attrs;
   std::vector<int> order = {3, 4, 5, 0, 1, 2};
+  dist_attrs.reserve(order.size());
   for (int ind : order) {
     dist_attrs.emplace_back(spmd_info.first[ind]);
   }
diff --git a/paddle/phi/infermeta/spmd_rules/gather.cc b/paddle/phi/infermeta/spmd_rules/gather.cc
index 30cb413ba1ddf..c2376d8545170 100644
--- a/paddle/phi/infermeta/spmd_rules/gather.cc
+++ b/paddle/phi/infermeta/spmd_rules/gather.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -215,5 +214,4 @@ SpmdInfo GatherGradInferSpmd(const DistMetaTensor& x,
           {x_grad_dist_attr}};
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/numel.cc b/paddle/phi/infermeta/spmd_rules/numel.cc
index ca0678b773163..7dcc78fbe0f50 100644
--- a/paddle/phi/infermeta/spmd_rules/numel.cc
+++ b/paddle/phi/infermeta/spmd_rules/numel.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 
 namespace phi {
 namespace distributed {
-using phi::distributed::auto_parallel::str_join;
 
 SpmdInfo NumelInferSpmd(const DistMetaTensor& x) {
   std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
diff --git a/paddle/phi/infermeta/spmd_rules/one_hot.cc b/paddle/phi/infermeta/spmd_rules/one_hot.cc
index dc90684dde1ef..bc7f0e32ba043 100644
--- a/paddle/phi/infermeta/spmd_rules/one_hot.cc
+++ b/paddle/phi/infermeta/spmd_rules/one_hot.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -90,5 +89,4 @@ SpmdInfo OneHotInferSpmdDynamic(const DistMetaTensor& x,
   return OneHotInferSpmd(x, num_classes.to<int32_t>());
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/pow.cc b/paddle/phi/infermeta/spmd_rules/pow.cc
index 59112010e5998..0a73d5706b1a0 100644
--- a/paddle/phi/infermeta/spmd_rules/pow.cc
+++ b/paddle/phi/infermeta/spmd_rules/pow.cc
@@ -13,8 +13,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/pow.h"
 #include "paddle/phi/infermeta/spmd_rules/elementwise.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 SpmdInfo PowInferSpmd(const DistMetaTensor& x, const Scalar& y) {
   return ElementwiseUnaryInferSpmd(x);
 }
@@ -23,5 +22,4 @@ SpmdInfo PowGradInferSpmd(const DistMetaTensor& x,
                           const Scalar y) {
   return ElementwiseUnaryGradInferSpmd(x, out_grad);
 }
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/slice.cc b/paddle/phi/infermeta/spmd_rules/slice.cc
index cde458df747e2..01587cd4dad12 100644
--- a/paddle/phi/infermeta/spmd_rules/slice.cc
+++ b/paddle/phi/infermeta/spmd_rules/slice.cc
@@ -21,8 +21,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -379,5 +378,4 @@ SpmdInfo StridedSliceGradInferSpmdDynamic(const DistMetaTensor& input,
   return SliceGradInferBase(input, out_grad, axes_bridge, {});
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/softmax.cc b/paddle/phi/infermeta/spmd_rules/softmax.cc
index b6f886a49468a..542050f15ef50 100644
--- a/paddle/phi/infermeta/spmd_rules/softmax.cc
+++ b/paddle/phi/infermeta/spmd_rules/softmax.cc
@@ -23,8 +23,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 #include "paddle/phi/infermeta/unary.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -206,5 +205,4 @@ SpmdInfo SoftmaxGradInferSpmd(const DistMetaTensor& out,
       DistMetaTensor(out_grad.dims(), out_grad_dist_attr));
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/split.cc b/paddle/phi/infermeta/spmd_rules/split.cc
index e1769392f7238..779f98fea9d24 100644
--- a/paddle/phi/infermeta/spmd_rules/split.cc
+++ b/paddle/phi/infermeta/spmd_rules/split.cc
@@ -227,6 +227,7 @@ SpmdInfo SplitWithNumInferSpmdDynamic(const DistMetaTensor& x,
   SpmdInfo ret;
   ret.first = tmp.first;
   std::vector<TensorDistAttr> out_dist_attrs;
+  out_dist_attrs.reserve(tmp.second.size());
   for (const auto& out : tmp.second) {
     out_dist_attrs.push_back(PADDLE_GET_CONST(TensorDistAttr, out));
   }
diff --git a/paddle/phi/infermeta/spmd_rules/swiglu.cc b/paddle/phi/infermeta/spmd_rules/swiglu.cc
index df6ee24733597..88466785ef3bc 100644
--- a/paddle/phi/infermeta/spmd_rules/swiglu.cc
+++ b/paddle/phi/infermeta/spmd_rules/swiglu.cc
@@ -21,8 +21,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 SpmdInfo SwiGLUInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y) {
   // y.dist_attr() is empty means y is None
@@ -75,5 +74,4 @@ SpmdInfo SwiGLUGradInferSpmd(const DistMetaTensor& x,
   }
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/triu.cc b/paddle/phi/infermeta/spmd_rules/triu.cc
index ed98889de4ea7..24e7770869a22 100644
--- a/paddle/phi/infermeta/spmd_rules/triu.cc
+++ b/paddle/phi/infermeta/spmd_rules/triu.cc
@@ -26,8 +26,8 @@ using phi::distributed::auto_parallel::str_join;
 SpmdInfo TriuInferSpmdBase(const DistMetaTensor& x) {
   auto x_shape = common::vectorize(x.dims());
   int x_ndim = x_shape.size();
-  auto x_dist_attr_src = x.dist_attr();
-  std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
+  const auto& x_dist_attr_src = x.dist_attr();
+  const std::vector<int64_t>& x_dims_mapping = x_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(
       x_ndim,
       x_dims_mapping.size(),
@@ -73,8 +73,9 @@ SpmdInfo TriuInferSpmdReverseBase(const DistMetaTensor& x,
                                   const DistMetaTensor& out) {
   auto out_shape = common::vectorize(out.dims());
   int out_ndim = out_shape.size();
-  auto out_dist_attr_src = out.dist_attr();
-  std::vector<int64_t> out_dims_mapping = out_dist_attr_src.dims_mapping();
+  const auto& out_dist_attr_src = out.dist_attr();
+  const std::vector<int64_t>& out_dims_mapping =
+      out_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(
       out_ndim,
       out_dims_mapping.size(),
@@ -119,7 +120,7 @@ SpmdInfo TriuInferSpmdReverse(const DistMetaTensor& x,
 SpmdInfo TriuGradInferSpmdBase(const DistMetaTensor& out_grad) {
   auto out_shape = common::vectorize(out_grad.dims());
   int out_ndim = out_shape.size();
-  auto out_dist_attr_src = out_grad.dist_attr();
+  const auto& out_dist_attr_src = out_grad.dist_attr();
   const std::vector<int64_t>& out_dims_mapping =
       out_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(out_ndim,
diff --git a/paddle/phi/infermeta/spmd_rules/unbind.cc b/paddle/phi/infermeta/spmd_rules/unbind.cc
index 79634e8076771..bc0a13cf1761e 100644
--- a/paddle/phi/infermeta/spmd_rules/unbind.cc
+++ b/paddle/phi/infermeta/spmd_rules/unbind.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -171,6 +170,7 @@ SpmdInfo UnbindInferSpmdDynamic(const DistMetaTensor& x, int axis) {
   SpmdInfo ret;
   ret.first = tmp.first;
   std::vector<TensorDistAttr> out_dist_attrs;
+  out_dist_attrs.reserve(tmp.second.size());
   for (const auto& out : tmp.second) {
     out_dist_attrs.push_back(PADDLE_GET_CONST(TensorDistAttr, out));
   }
@@ -178,5 +178,4 @@ SpmdInfo UnbindInferSpmdDynamic(const DistMetaTensor& x, int axis) {
   return ret;
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc
index 9aab6676bd383..dc3b76bd05b6e 100644
--- a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc
+++ b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc
@@ -25,8 +25,7 @@
 #include "paddle/phi/infermeta/spmd_rules/reshape.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -239,5 +238,4 @@ SpmdInfo UnsqueezeGradInferSpmd(const DistMetaTensor& xshape,
   return {{xshape.dist_attr(), spmd.first[0]}, {spmd.second[0]}};
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/utils.cc b/paddle/phi/infermeta/spmd_rules/utils.cc
index 336924dd5e951..995f152777655 100644
--- a/paddle/phi/infermeta/spmd_rules/utils.cc
+++ b/paddle/phi/infermeta/spmd_rules/utils.cc
@@ -22,8 +22,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -605,5 +604,4 @@ TensorDistAttr ReduceGradBroadCastDims(const TensorDistAttr& input,
   return grad_out;
 }
 
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/spmd_rules/where.cc b/paddle/phi/infermeta/spmd_rules/where.cc
index 6499d3f37635f..d823ae4ce75cb 100644
--- a/paddle/phi/infermeta/spmd_rules/where.cc
+++ b/paddle/phi/infermeta/spmd_rules/where.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace phi {
-namespace distributed {
+namespace phi::distributed {
 using phi::distributed::auto_parallel::str_join;
 
 SpmdInfo WhereInferSpmd(const DistMetaTensor& condition,
@@ -397,5 +396,4 @@ SpmdInfo WhereGradInferSpmd(const DistMetaTensor& condition,
       {cond_dist_attr, x_dist_attr, y_dist_attr, out_grad_dist_attr},
       {x_grad, y_grad});
 }
-}  // namespace distributed
-}  // namespace phi
+}  // namespace phi::distributed
diff --git a/paddle/phi/infermeta/strings/nullary.cc b/paddle/phi/infermeta/strings/nullary.cc
index 80f75c0e06721..ce7eb4a0e5233 100644
--- a/paddle/phi/infermeta/strings/nullary.cc
+++ b/paddle/phi/infermeta/strings/nullary.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/phi/infermeta/strings/nullary.h"
 
-namespace phi {
-namespace strings {
+namespace phi::strings {
 
 void CreateInferMeta(const IntArray& shape, MetaTensor* out) {
   const auto& out_dims = common::make_ddim(shape.GetData());
@@ -23,5 +22,4 @@ void CreateInferMeta(const IntArray& shape, MetaTensor* out) {
   out->set_layout(DataLayout::PSTRING_UNION);
 }
 
-}  // namespace strings
-}  // namespace phi
+}  // namespace phi::strings
diff --git a/paddle/phi/infermeta/strings/unary.cc b/paddle/phi/infermeta/strings/unary.cc
index c4c1aa5c990eb..d9ff624ebd995 100644
--- a/paddle/phi/infermeta/strings/unary.cc
+++ b/paddle/phi/infermeta/strings/unary.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/core/infermeta_utils.h"
 
-namespace phi {
-namespace strings {
+namespace phi::strings {
 
 void UnchangedInferMeta(const StringTensorMeta& x_meta, MetaTensor* out) {
   out->set_dims(x_meta.dims);
@@ -31,5 +30,4 @@ void CreateLikeInferMeta(const MetaTensor& x, MetaTensor* out) {
   out->set_layout(x.layout());
 }
 
-}  // namespace strings
-}  // namespace phi
+}  // namespace phi::strings
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index 6c278867d9ac3..8e9cebf46ac0f 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -645,6 +645,161 @@ void GlobalScatterInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+// Infers the output metadata of the fused add + group_norm + silu op.
+//
+// Checks that `x` has at least 2 dims, that `groups` lies in [1, channels]
+// and divides the channel count, and that `scale` / `bias`, when provided,
+// are 1-D with one entry per channel. The channel axis is dim 1 for NCHW and
+// the last dim for any other layout.
+//
+// Outputs:
+//   y            - required; dims, dtype and LoD of `x`.
+//   mean         - required; dims [x.dims()[0], groups].
+//   variance     - required; dims [x.dims()[0], groups].
+//   residual_out - optional (may be nullptr); dims, dtype and LoD of `x`.
+//
+// `mean` / `variance` are FLOAT32 when `x` is FLOAT16 or BFLOAT16 and take
+// the dtype of `x` otherwise. `residual`, `epsilon` and `activation` are not
+// inspected here.
+void AddGroupNormSiluInferMeta(const MetaTensor& x,
+                               const MetaTensor& residual,
+                               const MetaTensor& scale,
+                               const MetaTensor& bias,
+                               float epsilon,
+                               int groups,
+                               const std::string& data_layout_str,
+                               const std::string& activation,
+                               MetaTensor* y,
+                               MetaTensor* residual_out,
+                               MetaTensor* mean,
+                               MetaTensor* variance) {
+  PADDLE_ENFORCE_NE(
+      y,
+      nullptr,
+      phi::errors::InvalidArgument(
+          "The y in AddGroupNormSiluInferMeta can't be nullptr."));
+  PADDLE_ENFORCE_NE(
+      mean,
+      nullptr,
+      phi::errors::InvalidArgument(
+          "The mean in AddGroupNormSiluInferMeta can't be nullptr."));
+  PADDLE_ENFORCE_NE(
+      variance,
+      nullptr,
+      phi::errors::InvalidArgument(
+          "The variance in AddGroupNormSiluInferMeta can't be nullptr."));
+
+  auto x_dim = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dim.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "The Input(X)'s dimension of Op(add_group_norm_silu) must be "
+          "greater than 1. But received: %u-D Tensor, which shape is [%s].",
+          x_dim.size(),
+          x_dim));
+
+  const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
+  const int64_t channel_num =
+      (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]);
+  auto batch_size = x_dim[0];
+  PADDLE_ENFORCE_LE(
+      groups,
+      channel_num,
+      phi::errors::InvalidArgument(
+          "The Attr(groups) of Op(add_group_norm_silu) must be less than or "
+          "equal to the number of channels. But received: groups "
+          "is [%s], channels is [%s], the Attr(data_layout) "
+          "is [%s]. The error may come from wrong data_layout setting.",
+          groups,
+          channel_num,
+          data_layout_str));
+  PADDLE_ENFORCE_GE(
+      groups,
+      1,
+      phi::errors::InvalidArgument(
+          "The Attr(groups) of Op(add_group_norm_silu) must be "
+          "greater than or equal to 1. But received: groups is [%s].",
+          groups));
+  PADDLE_ENFORCE_EQ(
+      channel_num % groups,
+      0,
+      phi::errors::InvalidArgument(
+          "Expected number of channels in input to be divisible by "
+          "num_groups, but got input channel is %d and num_groups is %d",
+          channel_num,
+          groups));
+
+  // scale / bias are validated only when they were actually provided.
+  if (scale) {
+    PADDLE_ENFORCE_EQ(
+        scale.dims().size(),
+        1UL,
+        phi::errors::InvalidArgument(
+            "The Input(Scale) of Op(add_group_norm_silu) should be 1-D "
+            "Tensor. But received: %u-D Tensor, the shape of Input(Scale) "
+            "is [%s].",
+            scale.dims().size(),
+            scale.dims()));
+    PADDLE_ENFORCE_EQ(
+        scale.dims()[0],
+        channel_num,
+        phi::errors::InvalidArgument(
+            "The Input(Scale)'s first dimension size of "
+            "Op(add_group_norm_silu) must be equal to the number of "
+            "channels. But received: the Input(Scale)'s first dimension "
+            "size is [%s], the channels is [%s], the Attr(data_layout) is "
+            "[%s]. The error may come from wrong data_layout setting.",
+            scale.dims()[0],
+            channel_num,
+            data_layout_str));
+  }
+  if (bias) {
+    PADDLE_ENFORCE_EQ(
+        bias.dims().size(),
+        1UL,
+        phi::errors::InvalidArgument(
+            "The Input(Bias) of Op(add_group_norm_silu) should be 1-D "
+            "Tensor. But received: %u-D Tensor, the shape of Input(Bias) "
+            "is [%s].",
+            bias.dims().size(),
+            bias.dims()));
+    PADDLE_ENFORCE_EQ(
+        bias.dims()[0],
+        channel_num,
+        phi::errors::InvalidArgument(
+            "The Input(Bias)'s first dimension size of "
+            "Op(add_group_norm_silu) must be equal to the number of "
+            "channels. But received: the Input(Bias)'s first dimension "
+            "size is [%s], the channels is [%s], the Attr(data_layout) is "
+            "[%s]. The error may come from wrong data_layout setting.",
+            bias.dims()[0],
+            channel_num,
+            data_layout_str));
+  }
+
+  const phi::DataType x_dtype = x.dtype();
+  y->set_dims(x_dim);
+  y->set_dtype(x_dtype);
+  y->share_lod(x);
+
+  // Statistics are kept in FLOAT32 for half-precision inputs.
+  const phi::DataType param_type =
+      (x_dtype == phi::DataType::BFLOAT16 || x_dtype == phi::DataType::FLOAT16)
+          ? phi::DataType::FLOAT32
+          : x_dtype;
+  // mean / variance were enforced to be non-null above.
+  mean->set_dims({batch_size, groups});
+  mean->set_dtype(param_type);
+  variance->set_dims({batch_size, groups});
+  variance->set_dtype(param_type);
+  if (residual_out) {
+    residual_out->set_dims(x_dim);
+    residual_out->set_dtype(x_dtype);
+    residual_out->share_lod(x);
+  }
+}
+
 void GroupNormInferMeta(const MetaTensor& x,
                         const MetaTensor& scale,
                         const MetaTensor& bias,
@@ -1746,7 +1881,7 @@ void SparseMomentumInferMeta(const MetaTensor& param,
                              MetaTensor* velocity_out,
                              MetaTensor* master_param_out) {
   auto lr_dims = common::product(learning_rate.dims());
-  PADDLE_ENFORCE_EQ(lr_dims != 0 && lr_dims == 1,
+  PADDLE_ENFORCE_EQ(lr_dims == 1,
                     true,
                     phi::errors::InvalidArgument(
                         "Learning_rate should be a scalar. But Received "
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index 8732a87c55cd6..1b276846619e6 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -144,6 +144,19 @@ void GlobalScatterInferMeta(const MetaTensor& x,
                             bool use_calc_stream,
                             MetaTensor* out);
 
+void AddGroupNormSiluInferMeta(const MetaTensor& x,
+                               const MetaTensor& residual,
+                               const MetaTensor& scale,
+                               const MetaTensor& bias,
+                               float epsilon,
+                               int groups,
+                               const std::string& data_layout,
+                               const std::string& activation,
+                               MetaTensor* y,
+                               MetaTensor* residual_out,
+                               MetaTensor* mean,
+                               MetaTensor* variance);
+
 void GroupNormInferMeta(const MetaTensor& x,
                         const MetaTensor& scale,
                         const MetaTensor& bias,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 0aca647dd6a49..6670361ad4f83 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -29,8 +29,6 @@ file(GLOB kernel_primitive_h "primitive/*.h")
 file(
   GLOB kernel_cu
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "gpu/*.cu"
-  "gpu/*.cu.cc"
   "gpudnn/*.cu"
   "kps/*.cu"
   "legacy/kps/*.cu"
@@ -40,21 +38,42 @@ file(
   "strings/gpu/*.cu"
   "fusion/gpu/*.cu")
 
+file(
+  GLOB kernel_gpu
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "gpu/*.cu" "gpu/*.cu.cc")
+
 if(APPLE OR WIN32)
   list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
   list(REMOVE_ITEM kernel_cu "sparse/gpu/conv_kernel_igemm.cu")
 endif()
 
 if(NOT WITH_DGC)
-  list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu")
+  list(REMOVE_ITEM kernel_gpu "gpu/dgc_kernel.cu")
 endif()
 
 if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
-  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
   list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
+  # kernel_gpu globs gpu/*.cu and gpu/*.cu.cc: drop grad kernels of both kinds.
+  list(FILTER kernel_gpu EXCLUDE REGEX ".*_grad_kernel\\.cu(\\.cc)?$")
 endif()
 
 if(WITH_CUTLASS)
+  add_custom_target(
+    gemm_epilogue_compile_script ALL
+    COMMAND bash compile.sh "${PYTHON_EXECUTABLE}" "${CUDA_TOOLKIT_ROOT_DIR}"
+            \"${NVCC_ARCH_BIN}\" "${CMAKE_COMMAND}"
+    WORKING_DIRECTORY
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue
+    COMMENT "GemmEpilogue compile script")
+  add_custom_target(
+    fused_conv2d_add_act_compile_script ALL
+    COMMAND bash compile.sh "${PYTHON_EXECUTABLE}" "${CUDA_TOOLKIT_ROOT_DIR}"
+            \"${NVCC_ARCH_BIN}\" "${CMAKE_COMMAND}"
+    WORKING_DIRECTORY
+      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d
+    COMMENT "FusedConv2dAddAct compile script")
+
   execute_process(
     COMMAND
       ${PYTHON_EXECUTABLE}
@@ -201,6 +220,15 @@ if(WITH_ROCM)
   list(
     REMOVE_ITEM
     kernel_cu
+    "gpudnn/mha_cudnn_frontend.cu"
+    "fusion/gpu/blha_get_max_len.cu"
+    "fusion/gpu/block_multi_head_attention_kernel.cu"
+    "fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
+    "fusion/gpu/fused_bn_add_activation_kernel.cu"
+    "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
+  list(
+    REMOVE_ITEM
+    kernel_gpu
     "gpu/affine_grid_grad_kernel.cu"
     "gpu/apply_per_channel_scale_kernel.cu"
     "gpu/cholesky_solve_kernel.cu"
@@ -213,13 +241,7 @@ if(WITH_ROCM)
     "gpu/put_along_axis_grad_kernel.cu"
     "gpu/put_along_axis_kernel.cu"
     "gpu/qr_kernel.cu"
-    "gpu/svd_kernel.cu"
-    "gpudnn/mha_cudnn_frontend.cu"
-    "fusion/gpu/blha_get_max_len.cu"
-    "fusion/gpu/block_multi_head_attention_kernel.cu"
-    "fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
-    "fusion/gpu/fused_bn_add_activation_kernel.cu"
-    "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
+    "gpu/svd_kernel.cu")
 endif()
 
 set(cc_search_pattern
@@ -276,6 +298,8 @@ file(
 if(WITH_GPU OR WITH_ROCM)
   collect_srcs(kernels_srcs SRCS ${kernel_cu})
   kernel_declare("${kernel_cu}")
+  collect_srcs(kernels_gpu_srcs SRCS ${kernel_gpu})
+  kernel_declare("${kernel_gpu}")
 endif()
 
 if(WITH_XPU)
diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h
index b2fae7b0406e0..8aed27bb59ea9 100644
--- a/paddle/phi/kernels/activation_grad_kernel.h
+++ b/paddle/phi/kernels/activation_grad_kernel.h
@@ -307,7 +307,6 @@ DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor);
 DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Ceil);
 
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps);
@@ -318,5 +317,6 @@ DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, beta, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset);
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(ThresholdedRelu, threshold, value);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h
index 70c0187e68865..bf3cb325160d3 100644
--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -74,7 +74,6 @@ DECLARE_ACTIVATION_KERNEL(Ceil)
 DECLARE_ACTIVATION_KERNEL(Negative)
 
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
-DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
@@ -87,6 +86,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(ThresholdedRelu, threshold, value)
 
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc
index ed2722f9d8411..29149ae0f768a 100644
--- a/paddle/phi/kernels/check_memory_continue_kernel.cc
+++ b/paddle/phi/kernels/check_memory_continue_kernel.cc
@@ -20,7 +20,7 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "paddle/phi/backends/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_alignment.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc
index a60369af449f4..fea53f55ce8df 100644
--- a/paddle/phi/kernels/coalesce_tensor_kernel.cc
+++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc
@@ -20,7 +20,7 @@
 #include "glog/logging.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_alignment.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index 3f26f8c388e66..b8ced8d4defe2 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -155,9 +155,6 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_NODEP(Ceil, ZeroGradFunctor);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                LeakyReluGradFunctor,
                                                alpha);
-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
-                                               ThresholdedReluGradFunctor,
-                                               threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
                                                SoftShrinkGradFunctor,
                                                lambda);
@@ -188,6 +185,10 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
                                                  HardSigmoidGradFunctor,
                                                  slope,
                                                  offset);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(ThresholdedRelu,
+                                               ThresholdedReluGradFunctor,
+                                               threshold,
+                                               value);
 
 template <typename T, typename Context>
 void SiluGradKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index 92acf104fedcf..fda8493c9f452 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -106,9 +106,6 @@ DEFINE_CPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Exp, ExpFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Expm1, Expm1Functor)
 
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
-DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
-                                     ThresholdedReluFunctor,
-                                     threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
@@ -122,6 +119,10 @@ DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                      HardSigmoidFunctor,
                                      slope,
                                      offset)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(ThresholdedRelu,
+                                     ThresholdedReluFunctor,
+                                     threshold,
+                                     value)
 
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc
index f39bddbb443ba..422f566c6612e 100644
--- a/paddle/phi/kernels/cpu/cumprod_kernel.cc
+++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc
@@ -32,8 +32,16 @@ void CumprodKernel(const Context& dev_ctx,
                    DenseTensor* out) {
   const DenseTensor* x = &input;
   auto* x_data = x->data<T>();
-  auto* out_data = dev_ctx.template Alloc<T>(out);
+  auto* out_ptr = dev_ctx.template Alloc<T>(out);
   DDim shape = x->dims();
+  DenseTensor out_tmp;
+  T* out_data = nullptr;
+  if (x_data == out_ptr) {
+    out_tmp.Resize(shape);
+    out_data = dev_ctx.template Alloc<T>(&out_tmp);
+  } else {
+    out_data = out_ptr;
+  }
 
   size_t outer_dim = 1;
   size_t mid_dim = 1;
@@ -88,6 +96,9 @@ void CumprodKernel(const Context& dev_ctx,
       }
     }
   }
+  if (x_data == out_ptr) {
+    memcpy(out_ptr, out_data, out->numel() * sizeof(T));
+  }
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc
index 204c544e2d95f..1e2954cfeac91 100644
--- a/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc
@@ -35,7 +35,7 @@ void FillDiagonalGradKernel(const Context& ctx,
     auto size = x_grad->numel();
     auto wrapsize = std::min(size, dx_dims[1] * dx_dims[1]);
 
-    // The wrap mode supported only the dims equels to 2; In wrap mode, the
+    // The wrap mode supported only the dims equals to 2; In wrap mode, the
     // value will be filled in cycles
     if (wrap) {
       wrapsize = size;
diff --git a/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc
index c5888f5d30ed2..b38f3403df1b7 100644
--- a/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc
+++ b/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc
@@ -36,7 +36,7 @@ void FillDiagonalKernel(const Context& ctx,
   auto strides = funcs::CalStride(out_dims);
   auto size = out->numel();
 
-  // The wrap mode supported only the dims equels to 2; In wrap mode, the
+  // The wrap mode supported only the dims equals to 2; In wrap mode, the
   // value will be filled in cycles
   if (!wrap) {
     size = std::min(size, out_dims[1] * out_dims[1]);
diff --git a/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc
index 5d0fa3c8b5753..0d43f5dec05d7 100644
--- a/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc
+++ b/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc
@@ -85,7 +85,7 @@ void FillDiagonalTensorKernel(const Context &ctx,
 
   phi::Copy(ctx, x, ctx.GetPlace(), false, out);
   auto out_dims = out->dims();
-  auto matdims = y.dims();
+  const auto &matdims = y.dims();
   auto fill_dims = common::flatten_to_2d(matdims, matdims.size() - 1);
 
   std::array<int64_t, 2> new_dims = {};
diff --git a/paddle/phi/kernels/cpu/inverse_grad_kernel.cc b/paddle/phi/kernels/cpu/inverse_grad_kernel.cc
index 97c10e69c8eab..5014cfd0f95c7 100644
--- a/paddle/phi/kernels/cpu/inverse_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/inverse_grad_kernel.cc
@@ -16,5 +16,11 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
-PD_REGISTER_KERNEL(
-    inverse_grad, CPU, ALL_LAYOUT, phi::InverseGradKernel, float, double) {}
+PD_REGISTER_KERNEL(inverse_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::InverseGradKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/inverse_kernel.cc b/paddle/phi/kernels/cpu/inverse_kernel.cc
index 4b21718eca3f2..6fecef6f888dc 100644
--- a/paddle/phi/kernels/cpu/inverse_kernel.cc
+++ b/paddle/phi/kernels/cpu/inverse_kernel.cc
@@ -16,5 +16,11 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
-PD_REGISTER_KERNEL(
-    inverse, CPU, ALL_LAYOUT, phi::InverseKernel, float, double) {}
+PD_REGISTER_KERNEL(inverse,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::InverseKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/fluid/operators/ops_signature/number_count_sig.cc b/paddle/phi/kernels/cpu/lstm_grad_kernel.cc
similarity index 58%
rename from paddle/fluid/operators/ops_signature/number_count_sig.cc
rename to paddle/phi/kernels/cpu/lstm_grad_kernel.cc
index 48e0b4fce9ac1..ddaa85c8bdce1 100644
--- a/paddle/fluid/operators/ops_signature/number_count_sig.cc
+++ b/paddle/phi/kernels/cpu/lstm_grad_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,15 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/core/compat/op_utils.h"
+#include <memory>
+#include <string>
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/lstm_kernel_impl.h"
 
-namespace phi {
-
-KernelSignature NumberCountOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("number_count", {"numbers"}, {"upper_range"}, {"Out"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(number_count, phi::NumberCountOpArgumentMapping);
+PD_REGISTER_KERNEL(
+    lstm_grad, CPU, ALL_LAYOUT, phi::LSTMGradKernel, float, double) {}
diff --git a/test/deprecated/cpp_extension/custom_power.h b/paddle/phi/kernels/cpu/lstm_kernel.cc
similarity index 55%
rename from test/deprecated/cpp_extension/custom_power.h
rename to paddle/phi/kernels/cpu/lstm_kernel.cc
index f2cf8acb9cd52..848ba68bb3b76 100644
--- a/test/deprecated/cpp_extension/custom_power.h
+++ b/paddle/phi/kernels/cpu/lstm_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,18 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#pragma once
 
-#include "paddle/extension.h"
+#include <memory>
+#include <string>
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/lstm_kernel_impl.h"
 
-struct Power {
-  Power(int A, int B) {
-    tensor_ = paddle::ones({A, B}, phi::DataType::FLOAT32, phi::CPUPlace());
-  }
-  explicit Power(paddle::Tensor x) { tensor_ = x; }
-  paddle::Tensor forward() { return paddle::experimental::pow(tensor_, 2); }
-  paddle::Tensor get() const { return tensor_; }
-
- private:
-  paddle::Tensor tensor_;
-};
+PD_REGISTER_KERNEL(lstm, CPU, ALL_LAYOUT, phi::LSTMKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc
index 5b43fb02b5117..9d1319e0b5e4a 100644
--- a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc
@@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(meshgrid_grad,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/meshgrid_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_kernel.cc
index 35e43f7bbc85e..a0239da6bb128 100644
--- a/paddle/phi/kernels/cpu/meshgrid_kernel.cc
+++ b/paddle/phi/kernels/cpu/meshgrid_kernel.cc
@@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(meshgrid,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc
index 4511d9164f002..f262c046e1687 100644
--- a/paddle/phi/kernels/cpu/pool_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc
@@ -19,6 +19,8 @@
 
 PD_REGISTER_KERNEL(
     pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {}
+PD_REGISTER_KERNEL(
+    lp_pool2d_grad, CPU, ALL_LAYOUT, phi::LPPool2dGradKernel, float, double) {}
 PD_REGISTER_KERNEL(pool2d_double_grad,
                    CPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc
index e606173919d74..d4c66eedc54ff 100644
--- a/paddle/phi/kernels/cpu/pool_kernel.cc
+++ b/paddle/phi/kernels/cpu/pool_kernel.cc
@@ -18,6 +18,8 @@
 #include "paddle/phi/kernels/impl/pool_kernel_impl.h"
 
 PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {}
+PD_REGISTER_KERNEL(
+    lp_pool2d, CPU, ALL_LAYOUT, phi::LPPool2dKernel, float, double) {}
 PD_REGISTER_KERNEL(max_pool2d_with_index,
                    CPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/cpu/random_routing_kernel.cc b/paddle/phi/kernels/cpu/random_routing_kernel.cc
index cdeab98f4c1ab..f236ad4d9370c 100644
--- a/paddle/phi/kernels/cpu/random_routing_kernel.cc
+++ b/paddle/phi/kernels/cpu/random_routing_kernel.cc
@@ -16,8 +16,7 @@
 #include "paddle/common/errors.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace fusion {
+namespace phi::fusion {
 
 template <typename T, typename Context>
 void RandomRoutingKernel(const Context& dev_ctx,
@@ -29,8 +28,7 @@ void RandomRoutingKernel(const Context& dev_ctx,
       "Do not support expert count op for cpu kernel now."));
 }
 
-}  // namespace fusion
-}  // namespace phi
+}  // namespace phi::fusion
 
 PD_REGISTER_KERNEL(random_routing,
                    CPU,
diff --git a/paddle/phi/kernels/cpu/tdm_child_kernel.cc b/paddle/phi/kernels/cpu/tdm_child_kernel.cc
index 246f2113d65e8..3fabbba572f7e 100644
--- a/paddle/phi/kernels/cpu/tdm_child_kernel.cc
+++ b/paddle/phi/kernels/cpu/tdm_child_kernel.cc
@@ -104,7 +104,7 @@ void TDMChildKernel(const Context &dev_ctx,
                     const phi::DenseTensor &x,
                     const phi::DenseTensor &tree_info,
                     int child_nums,
-                    int dtype,
+                    phi::DataType dtype,
                     phi::DenseTensor *child,
                     phi::DenseTensor *leaf_mask) {
   const auto &input_type = x.dtype();
@@ -132,7 +132,7 @@ void TDMChildKernel(const Context &dev_ctx,
           DataTypeToString(DataType::INT32),
           DataTypeToString(DataType::INT64)));
 
-  auto output_type = phi::TransToPhiDataType(dtype);
+  auto output_type = dtype;
   bool out_type_match =
       output_type == DataType::INT32 || output_type == DataType::INT64;
   PADDLE_ENFORCE_EQ(out_type_match,
diff --git a/paddle/phi/kernels/cpu/tile_kernel.cc b/paddle/phi/kernels/cpu/tile_kernel.cc
index 2320c30310a64..30eb1d5cd6c47 100644
--- a/paddle/phi/kernels/cpu/tile_kernel.cc
+++ b/paddle/phi/kernels/cpu/tile_kernel.cc
@@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(tile,
                    double,
                    int,
                    int64_t,
+                   phi::dtype::float16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index ba1d9873ec2a4..27223dad0c1de 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -1825,22 +1825,28 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
   float threshold;
+  // Result emitted for elements that are not strictly greater than threshold.
+  float value;
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
+    return {{"threshold", &threshold}, {"value", &value}};
   }
 
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
     auto th = static_cast<T>(threshold);  // NOLINT
-    out.device(d) = (x > th).template cast<T>() * x;
+    // out = x > threshold ? x : value. Selecting (rather than summing two
+    // 0/1-masked terms) keeps infinite or NaN operands out of the branch that
+    // was not chosen, since 0 * inf and 0 * NaN are NaN.
+    out.device(d) = (x > th).select(x, x.constant(static_cast<T>(value)));
   }
 };
 
 template <typename T>
 struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
   float threshold;
+  float value;
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
+    return {{"threshold", &threshold}, {"value", &value}};
   }
 
   template <typename Device,
@@ -4230,16 +4233,16 @@ struct CudaHardTanhGradFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
   float threshold;
+  float value;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
+    return {{"threshold", &threshold}, {"value", &value}};
   }
 
-  // thresholded_relu(x) = x > threshold ? x : 0
+  // thresholded_relu(x, threshold, value) = x > threshold ? x : value
   __device__ __forceinline__ T operator()(const T x) const {
-    return x > static_cast<T>(threshold) ? x : zero;
+    return x > static_cast<T>(threshold) ? x : static_cast<T>(value);
   }
 };
 
@@ -4247,9 +4250,10 @@ template <typename T>
 struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
   float threshold;
+  float value;
 
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
+    return {{"threshold", &threshold}, {"value", &value}};
   }
 
   // dx = x > threshold ? dout : 0
diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
index 96b2128eee16c..a58b5998a6703 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
@@ -685,6 +685,63 @@ struct CUBlas<phi::dtype::complex<float>> {
         ldb,
         batch_size));
   }
+
+  static void GETRF_BATCH(cublasHandle_t handle,
+                          int n,
+                          phi::dtype::complex<float> **A,
+                          int lda,
+                          int *ipiv,
+                          int *info,
+                          int batch_size) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetrfBatched(
+        handle,
+        n,
+        reinterpret_cast<cuFloatComplex **>(A),
+        lda,
+        ipiv,
+        info,
+        batch_size));
+  }
+
+  static void GETRI_BATCH(cublasHandle_t handle,
+                          int n,
+                          const phi::dtype::complex<float> **A,
+                          int lda,
+                          const int *ipiv,
+                          phi::dtype::complex<float> **Ainv,
+                          int ldc,
+                          int *info,
+                          int batch_size) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetriBatched(
+        handle,
+        n,
+        reinterpret_cast<const cuFloatComplex **>(A),
+        lda,
+        ipiv,
+        reinterpret_cast<cuFloatComplex **>(Ainv),
+        ldc,
+        info,
+        batch_size));
+  }
+
+  static void MATINV_BATCH(cublasHandle_t handle,
+                           int n,
+                           const phi::dtype::complex<float> **A,
+                           int lda,
+                           phi::dtype::complex<float> **Ainv,
+                           int lda_inv,
+                           int *info,
+                           int batch_size) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCmatinvBatched(
+        handle,
+        n,
+        reinterpret_cast<const cuFloatComplex **>(A),
+        lda,
+        reinterpret_cast<cuFloatComplex **>(Ainv),
+        lda_inv,
+        info,
+        batch_size));
+  }
 };
 
 template <>
@@ -923,6 +980,63 @@ struct CUBlas<phi::dtype::complex<double>> {
         "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
   }
+
+  static void GETRF_BATCH(cublasHandle_t handle,  // wraps cublasZgetrfBatched
+                          int n,
+                          phi::dtype::complex<double> **A,
+                          int lda,
+                          int *ipiv,
+                          int *info,
+                          int batch_size) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetrfBatched(
+        handle,
+        n,
+        reinterpret_cast<cuDoubleComplex **>(A),  // passed as cuBLAS type
+        lda,
+        ipiv,
+        info,
+        batch_size));  // every other argument is forwarded unchanged
+  }
+
+  static void GETRI_BATCH(cublasHandle_t handle,  // wraps cublasZgetriBatched
+                          int n,
+                          const phi::dtype::complex<double> **A,
+                          int lda,
+                          const int *ipiv,
+                          phi::dtype::complex<double> **Ainv,
+                          int ldc,
+                          int *info,
+                          int batch_size) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetriBatched(
+        handle,
+        n,
+        reinterpret_cast<const cuDoubleComplex **>(A),  // passed as cuBLAS type
+        lda,
+        ipiv,
+        reinterpret_cast<cuDoubleComplex **>(Ainv),  // passed as cuBLAS type
+        ldc,
+        info,
+        batch_size));  // every other argument is forwarded unchanged
+  }
+
+  static void MATINV_BATCH(cublasHandle_t handle,  // wraps cublasZmatinvBatched
+                           int n,
+                           const phi::dtype::complex<double> **A,
+                           int lda,
+                           phi::dtype::complex<double> **Ainv,
+                           int lda_inv,
+                           int *info,
+                           int batch_size) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZmatinvBatched(
+        handle,
+        n,
+        reinterpret_cast<const cuDoubleComplex **>(A),  // passed as cuBLAS type
+        lda,
+        reinterpret_cast<cuDoubleComplex **>(Ainv),  // passed as cuBLAS type
+        lda_inv,
+        info,
+        batch_size));  // every other argument is forwarded unchanged
+  }
 };
 
 template <>
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc
index fd49748666a6e..c42bbbd3a5318 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cc
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 /*
  * All tensors' dimension should be the same and the values of
@@ -132,5 +131,4 @@ struct SplitFunctor<phi::CPUContext, T> {
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/data_layout_transform.cc b/paddle/phi/kernels/funcs/data_layout_transform.cc
index 383881c6cc3c9..fc67ef927f4cc 100644
--- a/paddle/phi/kernels/funcs/data_layout_transform.cc
+++ b/paddle/phi/kernels/funcs/data_layout_transform.cc
@@ -28,8 +28,7 @@
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #endif
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 #ifdef PADDLE_WITH_DNNL
 
@@ -131,5 +130,4 @@ void TransDataLayoutFromOneDNN(DataLayout in_layout,
 
 #endif
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/eigen/erf.cc b/paddle/phi/kernels/funcs/eigen/erf.cc
index 63d3bba30f99a..5734c6eed61e5 100644
--- a/paddle/phi/kernels/funcs/eigen/erf.cc
+++ b/paddle/phi/kernels/funcs/eigen/erf.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename T>
 struct EigenErf<Eigen::DefaultDevice, T> {
@@ -56,5 +55,4 @@ INSTANTIATION(EigenErf);
 INSTANTIATION(EigenErfGrad);
 #undef INSTANTIATION
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/eigen/pad.cc b/paddle/phi/kernels/funcs/eigen/pad.cc
index 946bff40544ee..c51cd25e45c29 100644
--- a/paddle/phi/kernels/funcs/eigen/pad.cc
+++ b/paddle/phi/kernels/funcs/eigen/pad.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename T, int Rank>
 struct EigenPad<Eigen::DefaultDevice, T, Rank> {
@@ -72,5 +71,4 @@ INSTANTIATION(EigenPad, dtype::complex<float>);
 INSTANTIATION(EigenPad, dtype::complex<double>);
 #undef INSTANTIATION
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/eigen/reverse.cc b/paddle/phi/kernels/funcs/eigen/reverse.cc
index bd1996956cd38..7b37d56b79e0e 100644
--- a/paddle/phi/kernels/funcs/eigen/reverse.cc
+++ b/paddle/phi/kernels/funcs/eigen/reverse.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename T, int Rank>
 struct EigenReverse<Eigen::DefaultDevice, T, Rank> {
@@ -46,5 +45,4 @@ INSTANTIATION(EigenReverse, float);
 INSTANTIATION(EigenReverse, double);
 #undef INSTANTIATION
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h b/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h
index 0326881940aaa..82516cd3c6d07 100644
--- a/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h
+++ b/paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h
@@ -1165,7 +1165,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
                                             UseIntermediateOut,
                                             BcastY,
                                             SameShapeOfIntermediateOutAndOut>(
-          dev_ctx,
+          reinterpret_cast<const phi::GPUContext &>(dev_ctx),
           x_data,
           y_data,
           intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
@@ -1214,7 +1214,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
                                             UseIntermediateOut,
                                             BcastY,
                                             SameShapeOfIntermediateOutAndOut>(
-          dev_ctx.stream(),
+          reinterpret_cast<const phi::GPUContext &>(dev_ctx).stream(),
           x_data,
           y_data,
           intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
diff --git a/paddle/phi/kernels/funcs/fft.cc b/paddle/phi/kernels/funcs/fft.cc
index beb0a98636039..13a3822b26005 100644
--- a/paddle/phi/kernels/funcs/fft.cc
+++ b/paddle/phi/kernels/funcs/fft.cc
@@ -25,11 +25,11 @@
 #include "extern_pocketfft/pocketfft_hdronly.h"
 #endif
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 #if defined(PADDLE_WITH_ONEMKL)
 
-namespace detail {
+}  // namespace phi::funcs
+namespace phi::funcs::detail {
 // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
 template <typename Ti, typename To>
 void exec_fft(const phi::CPUContext& ctx,
@@ -141,7 +141,8 @@ void exec_fft(const phi::CPUContext& ctx,
   TransposeKernel<To, phi::CPUContext>(
       ctx, transposed_output, reverse_dim_permute, out);
 }
-}  // namespace detail
+}  // namespace phi::funcs::detail
+namespace phi::funcs {
 
 template <typename Ti, typename To>
 struct FFTC2CFunctor<phi::CPUContext, Ti, To> {
@@ -192,7 +193,8 @@ struct FFTC2RFunctor<phi::CPUContext, Ti, To> {
 };
 
 #elif defined(PADDLE_WITH_POCKETFFT)
-namespace detail {
+}  // namespace phi::funcs
+namespace phi::funcs::detail {
 template <typename T>
 static T compute_factor(size_t size, FFTNormMode normalization) {
   constexpr auto one = static_cast<T>(1);
@@ -206,7 +208,8 @@ static T compute_factor(size_t size, FFTNormMode normalization) {
   }
   PADDLE_THROW(phi::errors::InvalidArgument("Unsupported normalization type"));
 }
-}  // namespace detail
+}  // namespace phi::funcs::detail
+namespace phi::funcs {
 
 template <typename Ti, typename To>
 struct FFTC2CFunctor<phi::CPUContext, Ti, To> {
@@ -374,5 +377,4 @@ template struct FFTC2RFunctor<phi::CPUContext, complex64_t, float>;
 template struct FFTC2RFunctor<phi::CPUContext, complex128_t, double>;
 template struct FFTR2CFunctor<phi::CPUContext, float, complex64_t>;
 template struct FFTR2CFunctor<phi::CPUContext, double, complex128_t>;
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/fft_fill_conj.h b/paddle/phi/kernels/funcs/fft_fill_conj.h
index c47257818f3a3..594dccd99db23 100644
--- a/paddle/phi/kernels/funcs/fft_fill_conj.h
+++ b/paddle/phi/kernels/funcs/fft_fill_conj.h
@@ -189,26 +189,23 @@ template <typename T>
 struct FFTFillConjGradFunctor {
   T* input_;
   const size_t axis_;
-  const int64_t* strides_;
+  const int64_t stride_to_last_axis;
+  const int64_t stride_second_to_last_axis;
   const size_t double_length_;
 
   FFTFillConjGradFunctor(T* input,
                          size_t axis,
-                         const int64_t* strides,
+                         int64_t stride_second_to_last_axis,
+                         int64_t stride_to_last_axis,
                          size_t double_length)
       : input_(input),
         axis_(axis),
-        strides_(strides),
+        stride_to_last_axis(stride_to_last_axis),
+        stride_second_to_last_axis(stride_second_to_last_axis),
         double_length_(double_length) {}
 
   HOSTDEVICE void operator()(size_t index) {
-    size_t offtset = index;  // back
-    size_t index_i;
-    for (size_t i = 0; i <= axis_; i++) {
-      index_i = offtset / strides_[i];
-      offtset %= strides_[i];
-    }
-
+    size_t index_i = (index % stride_second_to_last_axis) / stride_to_last_axis;
     if ((0 < index_i) && (index_i < double_length_ + 1)) {
       input_[index] *= static_cast<T>(2);
     }
diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/phi/kernels/funcs/fused_elemwise_activation_functor.h
similarity index 62%
rename from paddle/fluid/operators/fused/fused_elemwise_activation_op.h
rename to paddle/phi/kernels/funcs/fused_elemwise_activation_functor.h
index a271a87d9eb35..5568611708339 100644
--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
+++ b/paddle/phi/kernels/funcs/fused_elemwise_activation_functor.h
@@ -1,69 +1,158 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/kernels/funcs/compound_functors.h"
 #include "paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace funcs {
+
+// True when x has more dims than y, or equal rank with no extent below y's.
+static inline bool IsBcastY(const phi::DDim &x_dim, const phi::DDim &y_dim) {
+  if (x_dim.size() != y_dim.size()) {
+    return x_dim.size() > y_dim.size();
+  }
+  for (int axis_idx = 0; axis_idx < x_dim.size(); ++axis_idx) {
+    if (x_dim[axis_idx] < y_dim[axis_idx]) {
+      return false;
+    }
+  }
+  return true;
+}
 
 /**
  * Whether the compound function is Unary(Binary(X, Y)).
  * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final
  * out.
  */
-bool IsUnaryCompound(const std::vector<std::string> &functor_list);
+static inline bool IsUnaryCompound(
+    const std::vector<std::string> &functor_list) {
+  PADDLE_ENFORCE_EQ(
+      functor_list.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "Invalid functor list size %d, which should be equal to %d.",
+          functor_list.size(),
+          2));
+  // Unary(Binary(X, Y)) is recognised by a binary functor in second position.
+  static std::unordered_set<std::string> binary_names = {
+      "elementwise_add", "elementwise_mul",
+      "elementwise_add_grad", "elementwise_mul_grad"};
+  return binary_names.find(functor_list[1]) != binary_names.end();
+}
 
 /**
  *  For the in-place unary functor, the inputs of op_desc only have Out and
  *  Out@Grad.
  */
-bool HasInPlaceUnary(const std::vector<std::string> &functor_list);
+static inline bool HasInPlaceUnary(
+    const std::vector<std::string> &functor_list) {
+  PADDLE_ENFORCE_EQ(
+      functor_list.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "Invalid functor list size %d, which should be equal to %d.",
+          functor_list.size(),
+          2));
+  // Only the relu forward/backward functors are listed as in-place here.
+  static std::unordered_set<std::string> inplace_names = {"relu", "relu_grad"};
+  for (const auto &name : functor_list) {
+    if (inplace_names.find(name) != inplace_names.end()) return true;
+  }
+  return false;
+}
 
 /**
  * Whether the Input(X) could be absent.
  */
-bool InputXCanBeAbsent(const std::vector<std::string> &functor_list);
+static inline bool InputXCanBeAbsent(
+    const std::vector<std::string> &functor_list) {
+  PADDLE_ENFORCE_EQ(
+      functor_list.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "Invalid functor list size %d, which should be equal to %d.",
+          functor_list.size(),
+          2));
+  static std::unordered_set<std::string> x_optional = {"elementwise_add_grad"};
+  const bool first = x_optional.find(functor_list[0]) != x_optional.end();
+  return first || x_optional.find(functor_list[1]) != x_optional.end();
+}
+
+/**
+ * Whether the compound function is supported: exactly one of the two functors
+ * must be a supported binary functor and the other a supported unary functor.
+ * Any other pair is reported as InvalidArgument; a normal return is true.
+ */
+static inline bool IsSupportedCompound(
+    const std::vector<std::string> &functors) {
+  PADDLE_ENFORCE_EQ(
+      functors.size(),
+      2UL,
+      phi::errors::InvalidArgument(
+          "Invalid functor list size %d, which should be equal to %d.",
+          functors.size(),
+          2));
+  static const std::unordered_set<std::string> unary_fun = {
+      "scale", "relu", "tanh", "sigmoid", "gelu"};
+  static const std::unordered_set<std::string> binary_fun = {"elementwise_add",
+                                                             "elementwise_mul"};
+
+  std::string unary_fun_str;
+  if (binary_fun.count(functors[0])) {
+    unary_fun_str = functors[1];
+  } else if (binary_fun.count(functors[1])) {
+    unary_fun_str = functors[0];
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "%s and %s are not included in fused_list.", functors[0], functors[1]));
+  }
+  PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "%s is not included in fused_list.", unary_fun_str));
+  return true;
+}
 
 template <typename DeviceContext,
           typename T,
           typename BinaryFunctor,
           typename UnaryFunctor>
-static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx,
-                                     const BinaryFunctor &binary_functor,
-                                     const UnaryFunctor &unary_functor,
-                                     const phi::DenseTensor &in_x,
-                                     const phi::DenseTensor &in_y,
-                                     std::vector<phi::DenseTensor *> *outputs) {
+void RunBinaryCompoundFunctor(const DeviceContext &dev_ctx,
+                              const BinaryFunctor &binary_functor,
+                              const UnaryFunctor &unary_functor,
+                              const phi::DenseTensor &in_x,
+                              const phi::DenseTensor &in_y,
+                              std::vector<phi::DenseTensor *> *outputs,
+                              int axis,
+                              bool save_intermediate_out) {
   // Z = Binary(X, Unary(Y))
   // intermediate_out = Unary(Y)
   // out = Binary(X, Unary(Y))
   // In this case, the shape of intermediate_out and out are different.
   phi::funcs::BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>
       compound_func(binary_functor, unary_functor);
-  int axis = ctx.Attr<int>("axis");
-  auto &dev_ctx = ctx.template device_context<DeviceContext>();
-  if (ctx.Attr<bool>("save_intermediate_out")) {
+  if (save_intermediate_out) {
     phi::funcs::FusedElemwiseAndActComputeEx<
         DeviceContext,
         T,
@@ -86,22 +175,23 @@ template <typename DeviceContext,
           typename T,
           typename UnaryFunctor,
           typename BinaryFunctor>
-static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx,
-                                     const UnaryFunctor &unary_functor,
-                                     const BinaryFunctor &binary_functor,
-                                     const phi::DenseTensor &in_x,
-                                     const phi::DenseTensor &in_y,
-                                     std::vector<phi::DenseTensor *> *outputs) {
+void RunUnaryCompoundFunctors(const DeviceContext &dev_ctx,
+                              const UnaryFunctor &unary_functor,
+                              const BinaryFunctor &binary_functor,
+                              const phi::DenseTensor &in_x,
+                              const phi::DenseTensor &in_y,
+                              std::vector<phi::DenseTensor *> *outputs,
+                              int axis,
+                              bool save_intermediate_out) {
   // Z = Unary(Binary(X, Y))
   // intermediate_out = Binary(X, Y)
   // out = Unary(Binary(X, Y))
   // In this case, the shape of intermediate_out and out are the same.
-  int axis = ctx.Attr<int>("axis");
 
   phi::funcs::UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>
       compound_func(unary_functor, binary_functor);
-  auto &dev_ctx = ctx.template device_context<DeviceContext>();
-  if (ctx.Attr<bool>("save_intermediate_out")) {
+
+  if (save_intermediate_out) {
     phi::funcs::FusedElemwiseAndActComputeEx<
         DeviceContext,
         T,
@@ -126,21 +216,20 @@ template <typename DeviceContext,
           typename UnaryFunctor,
           typename UnaryGradFunctor,
           bool InPlace>
-static void RunBinaryCompoundGradFunctors(
-    const framework::ExecutionContext &ctx,
-    const BinaryGradFunctor &binary_grad_functor,
-    const UnaryFunctor &unary_functor,
-    const UnaryGradFunctor &unary_grad_functor,
-    const phi::DenseTensor *in_x,
-    const phi::DenseTensor *in_y,
-    const phi::DenseTensor *in_out,
-    const phi::DenseTensor *in_intermediate_out,
-    const phi::DenseTensor *in_out_grad,
-    phi::DenseTensor *x_grad,
-    phi::DenseTensor *y_grad,
-    phi::DenseTensor *d_intermediate_out) {
+void RunBinaryCompoundGradFunctors(const DeviceContext &dev_ctx,
+                                   const BinaryGradFunctor &binary_grad_functor,
+                                   const UnaryFunctor &unary_functor,
+                                   const UnaryGradFunctor &unary_grad_functor,
+                                   const phi::DenseTensor *in_x,
+                                   const phi::DenseTensor *in_y,
+                                   const phi::DenseTensor *in_out,
+                                   const phi::DenseTensor *in_intermediate_out,
+                                   const phi::DenseTensor *in_out_grad,
+                                   phi::DenseTensor *x_grad,
+                                   phi::DenseTensor *y_grad,
+                                   phi::DenseTensor *d_intermediate_out,
+                                   int axis) {
   // Z = Binary(X, Unary(Y))
-  int axis = ctx.Attr<int>("axis");
 
   using BinaryCompoundDxFunctor = phi::funcs::
       BinaryCompoundGradDxFunctor<T, BinaryGradFunctor, UnaryFunctor>;
@@ -155,7 +244,6 @@ static void RunBinaryCompoundGradFunctors(
                                                             BinaryGradFunctor,
                                                             UnaryFunctor>;
 
-  auto &dev_ctx = ctx.template device_context<DeviceContext>();
   if (in_intermediate_out) {
     phi::funcs::FusedElemwiseAndActGradComputeEx<
         DeviceContext,
@@ -213,21 +301,20 @@ template <typename DeviceContext,
           typename BinaryFunctor,
           typename BinaryGradFunctor,
           bool InPlace>
-static void RunUnaryCompoundGradFunctors(
-    const framework::ExecutionContext &ctx,
-    const UnaryGradFunctor &unary_grad_functor,
-    const BinaryFunctor &binary_functor,
-    const BinaryGradFunctor &binary_grad_functor,
-    const phi::DenseTensor *in_x,
-    const phi::DenseTensor *in_y,
-    const phi::DenseTensor *in_out,
-    const phi::DenseTensor *in_intermediate_out,
-    const phi::DenseTensor *in_out_grad,
-    phi::DenseTensor *x_grad,
-    phi::DenseTensor *y_grad,
-    phi::DenseTensor *d_intermediate_out) {
+void RunUnaryCompoundGradFunctors(const DeviceContext &dev_ctx,
+                                  const UnaryGradFunctor &unary_grad_functor,
+                                  const BinaryFunctor &binary_functor,
+                                  const BinaryGradFunctor &binary_grad_functor,
+                                  const phi::DenseTensor *in_x,
+                                  const phi::DenseTensor *in_y,
+                                  const phi::DenseTensor *in_out,
+                                  const phi::DenseTensor *in_intermediate_out,
+                                  const phi::DenseTensor *in_out_grad,
+                                  phi::DenseTensor *x_grad,
+                                  phi::DenseTensor *y_grad,
+                                  phi::DenseTensor *d_intermediate_out,
+                                  int axis) {
   // Z = Unary(Binary(X, Y))
-  int axis = ctx.Attr<int>("axis");
 
   using UnaryCompoundDxFunctor =
       phi::funcs::UnaryCompoundGradDxFunctor<T,
@@ -247,7 +334,6 @@ static void RunUnaryCompoundGradFunctors(
                                                         BinaryFunctor,
                                                         InPlace>;
 
-  auto &dev_ctx = ctx.template device_context<DeviceContext>();
   if (in_intermediate_out) {
     phi::funcs::FusedElemwiseAndActGradComputeEx<
         DeviceContext,
@@ -300,125 +386,147 @@ static void RunUnaryCompoundGradFunctors(
 }
 
 template <typename DeviceContext, typename T>
-static void RunFunctors(const framework::ExecutionContext &ctx,
-                        const phi::DenseTensor &in_x,
-                        const phi::DenseTensor &in_y,
-                        std::vector<phi::DenseTensor *> *outputs) {
-  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+void RunFunctors(const DeviceContext &dev_ctx,
+                 const phi::DenseTensor &in_x,
+                 const phi::DenseTensor &in_y,
+                 std::vector<phi::DenseTensor *> *outputs,
+                 std::vector<std::string> functor_list,
+                 float in_scale,
+                 int axis,
+                 bool save_intermediate_out) {
+  auto &functors = functor_list;
 
   // TODO(zcd): The following code can be refined.
   auto funcs_str = functors[0] + "," + functors[1];
   if (funcs_str == "elementwise_add,scale") {
     // Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    T scale = static_cast<T>(in_scale);
     RunBinaryCompoundFunctor<DeviceContext,
                              T,
                              phi::funcs::AddFunctor<T>,
                              phi::funcs::ScaleFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::AddFunctor<T>(),
         phi::funcs::ScaleFunctor<T>(scale),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "scale,elementwise_add") {
     // Z = Unary(Binary(X, Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    T scale = static_cast<T>(in_scale);
     RunUnaryCompoundFunctors<DeviceContext,
                              T,
                              phi::funcs::ScaleFunctor<T>,
                              phi::funcs::AddFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::ScaleFunctor<T>(scale),
         phi::funcs::AddFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "elementwise_add,relu") {
     // Z = Binary(X, Unary(Y))
     RunBinaryCompoundFunctor<DeviceContext,
                              T,
                              phi::funcs::AddFunctor<T>,
                              phi::funcs::ReluFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::AddFunctor<T>(),
         phi::funcs::ReluFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "relu,elementwise_add") {
     // Z = Unary(Binary(X, Y))
     RunUnaryCompoundFunctors<DeviceContext,
                              T,
                              phi::funcs::ReluFunctor<T>,
                              phi::funcs::AddFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::ReluFunctor<T>(),
         phi::funcs::AddFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "elementwise_mul,scale") {
     // Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    T scale = static_cast<T>(in_scale);
     RunBinaryCompoundFunctor<DeviceContext,
                              T,
                              phi::funcs::MultiplyFunctor<T>,
                              phi::funcs::ScaleFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::MultiplyFunctor<T>(),
         phi::funcs::ScaleFunctor<T>(scale),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "tanh,elementwise_add") {
     // Z = Unary(Binary(X, Y))
     RunUnaryCompoundFunctors<DeviceContext,
                              T,
                              phi::funcs::TanhFunctor<T>,
                              phi::funcs::AddFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::TanhFunctor<T>(),
         phi::funcs::AddFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "elementwise_mul,tanh") {
     // Z = Binary(X, Unary(Y))
     RunBinaryCompoundFunctor<DeviceContext,
                              T,
                              phi::funcs::MultiplyFunctor<T>,
                              phi::funcs::TanhFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::MultiplyFunctor<T>(),
         phi::funcs::TanhFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "elementwise_mul,sigmoid") {
     // Z = Binary(X, Unary(Y))
     RunBinaryCompoundFunctor<DeviceContext,
                              T,
                              phi::funcs::MultiplyFunctor<T>,
                              phi::funcs::SigmoidFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::MultiplyFunctor<T>(),
         phi::funcs::SigmoidFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else if (funcs_str == "gelu,elementwise_add") {
     // Z = Unary(Binary(X, Y))
     RunUnaryCompoundFunctors<DeviceContext,
                              T,
                              phi::funcs::GeluFunctor<T>,
                              phi::funcs::AddFunctor<T>>(
-        ctx,
+        dev_ctx,
         phi::funcs::GeluFunctor<T>(),
         phi::funcs::AddFunctor<T>(),
         in_x,
         in_y,
-        outputs);
+        outputs,
+        axis,
+        save_intermediate_out);
   } else {
     PADDLE_THROW(phi::errors::InvalidArgument("%s has not been implemented.",
                                               funcs_str));
@@ -426,28 +534,31 @@ static void RunFunctors(const framework::ExecutionContext &ctx,
 }
 
 template <typename DeviceContext, typename T, bool InPlace>
-static void RunGradFunctors(const framework::ExecutionContext &ctx,
-                            const phi::DenseTensor *in_x,
-                            const phi::DenseTensor *in_y,
-                            const phi::DenseTensor *in_out,
-                            const phi::DenseTensor *in_intermediate_out,
-                            const phi::DenseTensor *in_out_grad,
-                            phi::DenseTensor *x_grad,
-                            phi::DenseTensor *y_grad,
-                            phi::DenseTensor *d_intermediate_out) {
-  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+void RunGradFunctors(const DeviceContext &dev_ctx,
+                     const phi::DenseTensor *in_x,
+                     const phi::DenseTensor *in_y,
+                     const phi::DenseTensor *in_out,
+                     const phi::DenseTensor *in_intermediate_out,
+                     const phi::DenseTensor *in_out_grad,
+                     phi::DenseTensor *x_grad,
+                     phi::DenseTensor *y_grad,
+                     phi::DenseTensor *d_intermediate_out,
+                     std::vector<std::string> functor_list,
+                     float in_scale,
+                     int axis) {
+  auto &functors = functor_list;
   auto funcs_str = functors[0] + "," + functors[1];
 
   if (funcs_str == "elementwise_add_grad,scale_grad") {
     // The backward of Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    T scale = static_cast<T>(in_scale);
     RunBinaryCompoundGradFunctors<DeviceContext,
                                   T,
                                   phi::funcs::AddGradFunctor<T>,
                                   phi::funcs::ScaleFunctor<T>,
                                   phi::funcs::ScaleGradFunctor<T>,
                                   InPlace>(
-        ctx,
+        dev_ctx,
         phi::funcs::AddGradFunctor<T>(),
         phi::funcs::ScaleFunctor<T>(scale),
         phi::funcs::ScaleGradFunctor<T>(scale),
@@ -458,17 +569,18 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
         in_out_grad,
         x_grad,
         y_grad,
-        d_intermediate_out);
+        d_intermediate_out,
+        axis);
   } else if (funcs_str == "scale_grad,elementwise_add_grad") {
     // The backward of Z = Unary(Binary(X, Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    T scale = static_cast<T>(in_scale);
     RunUnaryCompoundGradFunctors<DeviceContext,
                                  T,
                                  phi::funcs::ScaleGradFunctor<T>,
                                  phi::funcs::AddFunctor<T>,
                                  phi::funcs::AddGradFunctor<T>,
                                  InPlace>(
-        ctx,
+        dev_ctx,
         phi::funcs::ScaleGradFunctor<T>(scale),
         phi::funcs::AddFunctor<T>(),
         phi::funcs::AddGradFunctor<T>(),
@@ -479,7 +591,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
         in_out_grad,
         x_grad,
         y_grad,
-        d_intermediate_out);
+        d_intermediate_out,
+        axis);
   } else if (funcs_str == "elementwise_add_grad,relu_grad") {
     // The backward of Z = Binary(X, Unary(Y))
     RunBinaryCompoundGradFunctors<DeviceContext,
@@ -487,7 +600,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                   phi::funcs::AddGradFunctor<T>,
                                   phi::funcs::ReluFunctor<T>,
                                   phi::funcs::ReluGradFunctor<T>,
-                                  InPlace>(ctx,
+                                  InPlace>(dev_ctx,
                                            phi::funcs::AddGradFunctor<T>(),
                                            phi::funcs::ReluFunctor<T>(),
                                            phi::funcs::ReluGradFunctor<T>(),
@@ -498,7 +611,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                            in_out_grad,
                                            x_grad,
                                            y_grad,
-                                           d_intermediate_out);
+                                           d_intermediate_out,
+                                           axis);
   } else if (funcs_str == "relu_grad,elementwise_add_grad") {
     // The backward of Z = Unary(Binary(X, Y))
     RunUnaryCompoundGradFunctors<DeviceContext,
@@ -506,7 +620,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                  phi::funcs::ReluGradFunctor<T>,
                                  phi::funcs::AddFunctor<T>,
                                  phi::funcs::AddGradFunctor<T>,
-                                 InPlace>(ctx,
+                                 InPlace>(dev_ctx,
                                           phi::funcs::ReluGradFunctor<T>(),
                                           phi::funcs::AddFunctor<T>(),
                                           phi::funcs::AddGradFunctor<T>(),
@@ -517,17 +631,18 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                           in_out_grad,
                                           x_grad,
                                           y_grad,
-                                          d_intermediate_out);
+                                          d_intermediate_out,
+                                          axis);
   } else if (funcs_str == "elementwise_mul_grad,scale_grad") {
     // The backward of Z = Binary(X, Unary(Y))
-    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    T scale = static_cast<T>(in_scale);
     RunBinaryCompoundGradFunctors<DeviceContext,
                                   T,
                                   phi::funcs::MulGradFunctor<T>,
                                   phi::funcs::ScaleFunctor<T>,
                                   phi::funcs::ScaleGradFunctor<T>,
                                   InPlace>(
-        ctx,
+        dev_ctx,
         phi::funcs::MulGradFunctor<T>(),
         phi::funcs::ScaleFunctor<T>(scale),
         phi::funcs::ScaleGradFunctor<T>(scale),
@@ -538,7 +653,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
         in_out_grad,
         x_grad,
         y_grad,
-        d_intermediate_out);
+        d_intermediate_out,
+        axis);
   } else if (funcs_str == "tanh_grad,elementwise_add_grad") {
     // The backward of Z = Unary(Binary(X, Y))
     RunUnaryCompoundGradFunctors<DeviceContext,
@@ -546,7 +662,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                  phi::funcs::TanhGradFunctor<T>,
                                  phi::funcs::AddFunctor<T>,
                                  phi::funcs::AddGradFunctor<T>,
-                                 InPlace>(ctx,
+                                 InPlace>(dev_ctx,
                                           phi::funcs::TanhGradFunctor<T>(),
                                           phi::funcs::AddFunctor<T>(),
                                           phi::funcs::AddGradFunctor<T>(),
@@ -557,7 +673,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                           in_out_grad,
                                           x_grad,
                                           y_grad,
-                                          d_intermediate_out);
+                                          d_intermediate_out,
+                                          axis);
   } else if (funcs_str == "elementwise_mul_grad,tanh_grad") {
     // The backward of Z = Binary(X, Unary(Y))
     RunBinaryCompoundGradFunctors<DeviceContext,
@@ -565,7 +682,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                   phi::funcs::MulGradFunctor<T>,
                                   phi::funcs::TanhFunctor<T>,
                                   phi::funcs::TanhGradFunctor<T>,
-                                  InPlace>(ctx,
+                                  InPlace>(dev_ctx,
                                            phi::funcs::MulGradFunctor<T>(),
                                            phi::funcs::TanhFunctor<T>(),
                                            phi::funcs::TanhGradFunctor<T>(),
@@ -576,7 +693,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                            in_out_grad,
                                            x_grad,
                                            y_grad,
-                                           d_intermediate_out);
+                                           d_intermediate_out,
+                                           axis);
   } else if (funcs_str == "elementwise_mul_grad,sigmoid_grad") {
     // The backward of Z = Binary(X, Unary(Y))
     RunBinaryCompoundGradFunctors<DeviceContext,
@@ -584,7 +702,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                   phi::funcs::MulGradFunctor<T>,
                                   phi::funcs::SigmoidFunctor<T>,
                                   phi::funcs::SigmoidGradFunctor<T>,
-                                  InPlace>(ctx,
+                                  InPlace>(dev_ctx,
                                            phi::funcs::MulGradFunctor<T>(),
                                            phi::funcs::SigmoidFunctor<T>(),
                                            phi::funcs::SigmoidGradFunctor<T>(),
@@ -595,7 +713,8 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                            in_out_grad,
                                            x_grad,
                                            y_grad,
-                                           d_intermediate_out);
+                                           d_intermediate_out,
+                                           axis);
   } else if (funcs_str == "gelu_grad,elementwise_add_grad") {
     // The backward of Z = Unary(Binary(X, Y))
     RunUnaryCompoundGradFunctors<DeviceContext,
@@ -603,7 +722,7 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                  phi::funcs::GeluGradFunctor<T>,
                                  phi::funcs::AddFunctor<T>,
                                  phi::funcs::AddGradFunctor<T>,
-                                 InPlace>(ctx,
+                                 InPlace>(dev_ctx,
                                           phi::funcs::GeluGradFunctor<T>(),
                                           phi::funcs::AddFunctor<T>(),
                                           phi::funcs::AddGradFunctor<T>(),
@@ -614,170 +733,13 @@ static void RunGradFunctors(const framework::ExecutionContext &ctx,
                                           in_out_grad,
                                           x_grad,
                                           y_grad,
-                                          d_intermediate_out);
+                                          d_intermediate_out,
+                                          axis);
   } else {
     PADDLE_THROW(phi::errors::InvalidArgument("%s has not been implemented.",
                                               funcs_str));
   }
 }
 
-template <typename T, typename DeviceContext>
-class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto &in_x = GET_DATA_SAFELY(ctx.Input<phi::DenseTensor>("X"),
-                                 "Input",
-                                 "X",
-                                 "FusedElemwiseActivation");
-    auto &in_y = GET_DATA_SAFELY(ctx.Input<phi::DenseTensor>("Y"),
-                                 "Input",
-                                 "Y",
-                                 "FusedElemwiseActivation");
-
-    PADDLE_ENFORCE_EQ(
-        ctx.HasOutput("Out"),
-        true,
-        phi::errors::InvalidArgument("The output(Out) should not be empty"));
-    auto output = ctx.Output<phi::DenseTensor>("Out");
-
-    std::vector<phi::DenseTensor *> outputs;
-    outputs.emplace_back(output);
-
-    if (ctx.Attr<bool>("save_intermediate_out")) {
-      PADDLE_ENFORCE_EQ(ctx.HasOutput("IntermediateOut"),
-                        true,
-                        phi::errors::InvalidArgument(
-                            "The save_intermediate_out is enable, so the "
-                            "IntermediateOut should not be empty."));
-
-      auto intermediate_out = ctx.Output<phi::DenseTensor>("IntermediateOut");
-      outputs.emplace_back(intermediate_out);
-    } else {
-      outputs.emplace_back(nullptr);
-    }
-
-    RunFunctors<DeviceContext, T>(ctx, in_x, in_y, &outputs);
-  }
-};
-
-template <typename T, typename DeviceContext>
-class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto in_y = ctx.Input<phi::DenseTensor>("Y");
-    PADDLE_ENFORCE_NE(
-        in_y,
-        nullptr,
-        phi::errors::InvalidArgument("Input(Y) should not be nullptr."));
-    phi::DenseTensor *in_out =
-        const_cast<phi::DenseTensor *>(ctx.Input<phi::DenseTensor>("Out"));
-
-    auto in_out_grad =
-        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_NE(
-        in_out_grad,
-        nullptr,
-        phi::errors::InvalidArgument("Input(Out@Grad) should not be nullptr."));
-
-    phi::DenseTensor *in_x =
-        const_cast<phi::DenseTensor *>(ctx.Input<phi::DenseTensor>("X"));
-    phi::DenseTensor *x_grad =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    phi::DenseTensor *y_grad =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
-    phi::DenseTensor *d_intermediate_out =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("IntermediateOut"));
-
-    auto functor_list = ctx.Attr<std::vector<std::string>>("functor_list");
-
-    // Get intermediate_out
-    phi::DenseTensor *in_intermediate_out = nullptr;
-    if (ctx.Attr<bool>("save_intermediate_out")) {
-      // if save_intermediate_out is true, for Unary(Binary(x, y)) and
-      // Binary(x, Unary(y)), the Binary(x, y) and Unary(y) not need to
-      // recompute.
-      in_intermediate_out = const_cast<phi::DenseTensor *>(
-          ctx.Input<phi::DenseTensor>("IntermediateOut"));
-      PADDLE_ENFORCE_NE(in_intermediate_out,
-                        nullptr,
-                        phi::errors::InvalidArgument(
-                            "The option of 'save_intermediate_out' is opened,"
-                            " so the number of 'Out' should be two."));
-    } else {
-      if (!InputXCanBeAbsent(functor_list)) {
-        PADDLE_ENFORCE_NE(
-            in_x,
-            nullptr,
-            phi::errors::InvalidArgument("Input(X) should not be null."));
-      }
-    }
-
-    // Get in_x
-    if (ctx.HasInput("X")) {
-      PADDLE_ENFORCE_NE(
-          in_x,
-          nullptr,
-          phi::errors::InvalidArgument("Input(X) should not be null."));
-    } else {
-      // If functor_list contains elementwise_add, the backward doesn't use
-      // in_x, in_y and in_out.
-      PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list),
-                        true,
-                        phi::errors::InvalidArgument(
-                            "Only when the compoundfunctor contains "
-                            "elementwise_add_grad, the 'X' could be absent."));
-      in_x = const_cast<phi::DenseTensor *>(in_out_grad);
-    }
-
-    // Get in_Out
-    if (ctx.HasInput("Out")) {
-      PADDLE_ENFORCE_NE(
-          in_out,
-          nullptr,
-          phi::errors::InvalidArgument("Input(X) should not be null."));
-    } else {
-      // If functor_list contains elementwise_add, the backward doesn't use
-      // in_x, in_y and in_out.
-      PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list),
-                        true,
-                        phi::errors::InvalidArgument(
-                            "Only when the compoundfunctor contains "
-                            "elementwise_add_grad, the 'X' could be absent."));
-      in_out = const_cast<phi::DenseTensor *>(in_out_grad);
-    }
-
-    bool has_in_place = HasInPlaceUnary(functor_list);
-    if (has_in_place) {
-      RunGradFunctors<DeviceContext, T, true /*InPlace*/>(ctx,
-                                                          in_x,
-                                                          in_y,
-                                                          in_out,
-                                                          in_intermediate_out,
-                                                          in_out_grad,
-                                                          x_grad,
-                                                          y_grad,
-                                                          d_intermediate_out);
-    } else {
-      RunGradFunctors<DeviceContext, T, false /*InPlace*/>(ctx,
-                                                           in_x,
-                                                           in_y,
-                                                           in_out,
-                                                           in_intermediate_out,
-                                                           in_out_grad,
-                                                           x_grad,
-                                                           y_grad,
-                                                           d_intermediate_out);
-    }
-  }
-};
-
-template <typename T, typename DeviceContext>
-class FusedElemwiseAddActivationKernel
-    : public FusedElemwiseActivationKernel<T, DeviceContext> {};
-
-template <typename T, typename DeviceContext>
-class FusedElemwiseAddActivationGradKernel
-    : public FusedElemwiseActivationGradKernel<T, DeviceContext> {};
-
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
index ca6c44dbdbd76..df0cf3ac9be33 100644
--- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc
+++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc
@@ -18,8 +18,7 @@ limitations under the License. */
 
 #include "paddle/common/macros.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 class TensorAssign {
  public:
@@ -705,5 +704,4 @@ Instantiate_Template_Function(cpu_gather_kernel)                  // NOLINT
     Instantiate_Template_Function_With_Out(
         cpu_scatter_mul_min_max_value_grad_kernel)  // NOLINT
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/gpc.cc b/paddle/phi/kernels/funcs/gpc.cc
index ba24dbb442dfa..397f0d23ab12f 100644
--- a/paddle/phi/kernels/funcs/gpc.cc
+++ b/paddle/phi/kernels/funcs/gpc.cc
@@ -28,8 +28,7 @@
 
 #include "paddle/phi/core/enforce.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 typedef struct lmt_shape { /* Local minima table                */
   double y;                /* Y coordinate at local minimum     */
@@ -2263,5 +2262,4 @@ void gpc_tristrip_clip(gpc_op op,
   gpc_free<double>(sbt);
 }  // NOLINT
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc
index f0c946134906b..563c5a2d34fe2 100644
--- a/paddle/phi/kernels/funcs/gru_compute.cc
+++ b/paddle/phi/kernels/funcs/gru_compute.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h"
 #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename T>
 struct GRUUnitFunctor<phi::CPUContext, T> {
@@ -364,5 +363,4 @@ template struct GRUUnitFunctorV2<CPUContext, double>;
 template struct GRUUnitGradFunctorV2<CPUContext, float>;
 template struct GRUUnitGradFunctorV2<CPUContext, double>;
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/jit/gen/adamw.cc b/paddle/phi/kernels/funcs/jit/gen/adamw.cc
index 9426ea16a88fb..4a8545c24f964 100644
--- a/paddle/phi/kernels/funcs/jit/gen/adamw.cc
+++ b/paddle/phi/kernels/funcs/jit/gen/adamw.cc
@@ -19,9 +19,7 @@
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace gen {
+namespace phi::jit::gen {
 
 void AdamWJitCode::loadArgs() {
   static constexpr int32_t one_as_float = 0x3f800000;
@@ -155,9 +153,7 @@ class AdamWCreator : public JitCodeCreator<int> {
   }
 };
 
-}  // namespace gen
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::gen
 
 namespace gen = phi::jit::gen;
 
diff --git a/paddle/phi/kernels/funcs/jit/gen/blas.cc b/paddle/phi/kernels/funcs/jit/gen/blas.cc
index 1e29b7f4953fe..f9de8a6d01f45 100644
--- a/paddle/phi/kernels/funcs/jit/gen/blas.cc
+++ b/paddle/phi/kernels/funcs/jit/gen/blas.cc
@@ -18,9 +18,7 @@
 #include "paddle/phi/kernels/funcs/jit/macro.h"
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace gen {
+namespace phi::jit::gen {
 
 void VXXJitCode::genCode() {
   // do not need push stack, and do not need save avx512reg if do not use avx512
@@ -134,9 +132,7 @@ DECLARE_BLAS_CREATOR(VAddBias);
 
 #undef DECLARE_BLAS_CREATOR
 
-}  // namespace gen
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::gen
 
 namespace gen = phi::jit::gen;
 
diff --git a/paddle/phi/kernels/funcs/jit/gen/gru.cc b/paddle/phi/kernels/funcs/jit/gen/gru.cc
index 33dfaa6cd097c..69f1e62fbfcf0 100644
--- a/paddle/phi/kernels/funcs/jit/gen/gru.cc
+++ b/paddle/phi/kernels/funcs/jit/gen/gru.cc
@@ -20,9 +20,7 @@
 #include "paddle/phi/kernels/funcs/jit/macro.h"
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace gen {
+namespace phi::jit::gen {
 
 void GRUJitCode::genCode() {
   reg64_t reg_ptr_gates = rax;
@@ -107,9 +105,7 @@ DECLARE_GRU_CREATOR(GRUHtPart2);
 
 #undef DECLARE_GRU_CREATOR
 
-}  // namespace gen
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::gen
 
 namespace gen = phi::jit::gen;
 
diff --git a/paddle/phi/kernels/funcs/jit/gen/seqpool.cc b/paddle/phi/kernels/funcs/jit/gen/seqpool.cc
index fca00feb5c49b..5f1edf194d252 100644
--- a/paddle/phi/kernels/funcs/jit/gen/seqpool.cc
+++ b/paddle/phi/kernels/funcs/jit/gen/seqpool.cc
@@ -18,9 +18,7 @@
 #include "paddle/phi/kernels/funcs/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace gen {
+namespace phi::jit::gen {
 
 void SeqPoolJitCode::genCode() {
   constexpr int block = YMM_FLOAT_BLOCK;
@@ -85,9 +83,7 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
   }
 };
 
-}  // namespace gen
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::gen
 
 namespace gen = phi::jit::gen;
 
diff --git a/paddle/phi/kernels/funcs/jit/gen/vbroadcast.cc b/paddle/phi/kernels/funcs/jit/gen/vbroadcast.cc
index c52d7d50379b4..4b9944fb8b2ba 100644
--- a/paddle/phi/kernels/funcs/jit/gen/vbroadcast.cc
+++ b/paddle/phi/kernels/funcs/jit/gen/vbroadcast.cc
@@ -17,9 +17,7 @@
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace gen {
+namespace phi::jit::gen {
 
 void VBroadcastJitCode::genCode() {
   preCode();
@@ -85,9 +83,7 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
   }
 };
 
-}  // namespace gen
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::gen
 
 namespace gen = phi::jit::gen;
 
diff --git a/paddle/phi/kernels/funcs/jit/kernel_pool.cc b/paddle/phi/kernels/funcs/jit/kernel_pool.cc
index 97a09bf48ba50..e850626101130 100644
--- a/paddle/phi/kernels/funcs/jit/kernel_pool.cc
+++ b/paddle/phi/kernels/funcs/jit/kernel_pool.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/phi/kernels/funcs/jit/kernel_pool.h"
 
-namespace phi {
-namespace jit {
+namespace phi::jit {
 
 std::map<size_t, std::shared_ptr<void>>& GetJITCodesMap() {
   static thread_local std::map<size_t, std::shared_ptr<void>> g_jit_codes_map;
@@ -37,5 +36,4 @@ ReferKernelPool& ReferKernelPool::Instance() {
   return g_refer_kernel_pool;
 }
 
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit
diff --git a/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc b/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc
index c36ca0d7360cc..43a011277cb5f 100644
--- a/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc
+++ b/paddle/phi/kernels/funcs/jit/more/intrinsic/crf_decoding.cc
@@ -19,10 +19,7 @@
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace more {
-namespace intrinsic {
+namespace phi::jit::more::intrinsic {
 // Note: intrinsic code is not runtime build.
 // For example, if you build code on AVX, and run on AVX512 it can only use AVX
 
@@ -174,10 +171,7 @@ bool CRFDecodingKernel::CanBeUsed(const int& d) const {
   return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d >= block;
 }
 
-}  // namespace intrinsic
-}  // namespace more
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::more::intrinsic
 
 namespace intrinsic = phi::jit::more::intrinsic;
 
diff --git a/paddle/phi/kernels/funcs/jit/more/mix/mix.cc b/paddle/phi/kernels/funcs/jit/more/mix/mix.cc
index 7bb58a8b2463a..2c659111d435e 100644
--- a/paddle/phi/kernels/funcs/jit/more/mix/mix.cc
+++ b/paddle/phi/kernels/funcs/jit/more/mix/mix.cc
@@ -17,10 +17,7 @@
 #include "paddle/phi/kernels/funcs/jit/kernels.h"
 #include "paddle/phi/kernels/funcs/jit/registry.h"
 
-namespace phi {
-namespace jit {
-namespace more {
-namespace mix {
+namespace phi::jit::more::mix {
 
 using CPUPlace = phi::CPUPlace;
 
@@ -196,10 +193,7 @@ bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
 
 bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }
 
-}  // namespace mix
-}  // namespace more
-}  // namespace jit
-}  // namespace phi
+}  // namespace phi::jit::more::mix
 
 namespace mix = phi::jit::more::mix;
 
diff --git a/paddle/phi/kernels/funcs/lapack/lapack_function.cc b/paddle/phi/kernels/funcs/lapack/lapack_function.cc
index 09d45fcf24be9..ebfd53291c36f 100644
--- a/paddle/phi/kernels/funcs/lapack/lapack_function.cc
+++ b/paddle/phi/kernels/funcs/lapack/lapack_function.cc
@@ -17,8 +17,7 @@
 #include "paddle/phi/backends/dynload/lapack.h"
 #include "paddle/phi/common/complex.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 // LU (for example)
 template <>
@@ -537,5 +536,4 @@ void lapackSvd<float>(char jobz,
       &jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info);
 }
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/lstm_utils.h b/paddle/phi/kernels/funcs/lstm_utils.h
new file mode 100644
index 0000000000000..4a02b097fd340
--- /dev/null
+++ b/paddle/phi/kernels/funcs/lstm_utils.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/mixed_vector.h"
+#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
+#include "paddle/phi/kernels/funcs/sequence2batch.h"
+
+namespace phi {
+
+template <typename Context, typename T>
+inline void ReorderInitState(const Context& dev_ctx,
+                             const phi::DenseTensor& src,
+                             phi::Vector<size_t> index_lod,
+                             phi::DenseTensor* dst,
+                             bool indexed_src) {
+  dst->Resize(src.dims());         // give dst the same dims as src
+  dev_ctx.template Alloc<T>(dst);  // allocate dst's storage as T
+  phi::funcs::CopyMatrixRowsFunctor<Context, T>()(  // fills dst from src
+      dev_ctx, src, index_lod, dst, indexed_src);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/math/sampler.cc b/paddle/phi/kernels/funcs/math/sampler.cc
index b225674274a7b..46c20dc5a4727 100644
--- a/paddle/phi/kernels/funcs/math/sampler.cc
+++ b/paddle/phi/kernels/funcs/math/sampler.cc
@@ -18,8 +18,7 @@
 
 #include "paddle/phi/core/generator.h"
 
-namespace phi {
-namespace math {
+namespace phi::math {
 
 Sampler::~Sampler() = default;
 
@@ -93,5 +92,4 @@ int64_t CustomSampler::Sample() const {
 
 float CustomSampler::Probability(int64_t value) const { return probs_[value]; }
 
-}  // namespace math
-}  // namespace phi
+}  // namespace phi::math
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc
index c316970e6a560..2a3749ef36b81 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.cc
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename Context, typename T>
 void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
@@ -28,6 +27,7 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
 
 template class MatrixInverseFunctor<CPUContext, float>;
 template class MatrixInverseFunctor<CPUContext, double>;
+template class MatrixInverseFunctor<CPUContext, phi::dtype::complex<float>>;
+template class MatrixInverseFunctor<CPUContext, phi::dtype::complex<double>>;
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu
index c0ea7ad84c41b..f46dd714c9f55 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.cu
+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu
@@ -131,6 +131,8 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
 
 template class MatrixInverseFunctor<GPUContext, float>;
 template class MatrixInverseFunctor<GPUContext, double>;
+template class MatrixInverseFunctor<GPUContext, phi::dtype::complex<float>>;
+template class MatrixInverseFunctor<GPUContext, phi::dtype::complex<double>>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h
index f0cd265a54648..d45f7d8863a63 100644
--- a/paddle/phi/kernels/funcs/matrix_inverse.h
+++ b/paddle/phi/kernels/funcs/matrix_inverse.h
@@ -25,14 +25,69 @@ limitations under the License. */
 namespace phi {
 namespace funcs {
 
+// Inverts one n x n row-major matrix that starts `offset` elements into a_ptr
+// and writes the inverse at the same offset of a_inv_ptr.
+template <typename Context, typename T>
+struct MapMatrixInverseFunctor {
+  void operator()(
+      const Context& dev_ctx, const T* a_ptr, T* a_inv_ptr, int offset, int n) {
+    using Matrix =
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    // Eigen::Map wraps the caller's input and output buffers in place.
+    Eigen::Map<const Matrix> input(a_ptr + offset, n, n);
+    Eigen::Map<Matrix> output(a_inv_ptr + offset, n, n);
+    const Eigen::PartialPivLU<Matrix> lu(input);
+
+    // Reject the input when the smallest pivot magnitude is not positive.
+    const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
+    PADDLE_ENFORCE_GT(min_abs_pivot,
+                      static_cast<T>(0),
+                      errors::InvalidArgument("Input is not invertible."));
+    output.noalias() = lu.inverse();
+  }
+};
+
+// Specialization for phi::dtype::complex<T>: inverts one n x n row-major
+// matrix starting `offset` elements into a_ptr and stores the inverse at the
+// same offset of a_inv_ptr. Elements are converted to std::complex<T> before
+// being handed to Eigen and converted back afterwards.
+template <typename Context, typename T>
+struct MapMatrixInverseFunctor<Context, phi::dtype::complex<T>> {
+  void operator()(const Context& dev_ctx,
+                  const phi::dtype::complex<T>* a_ptr,
+                  phi::dtype::complex<T>* a_inv_ptr,
+                  int offset,
+                  int n) {
+    using Matrix = Eigen::Matrix<std::complex<T>,
+                                 Eigen::Dynamic,
+                                 Eigen::Dynamic,
+                                 Eigen::RowMajor>;
+    // Eigen-owned storage is released automatically, including when the
+    // invertibility check below fails.
+    Matrix mat(n, n);
+    for (int i = 0; i < n * n; i++) {
+      *(mat.data() + i) = static_cast<std::complex<T>>(*(a_ptr + offset + i));
+    }
+    Eigen::PartialPivLU<Matrix> lu;
+    lu.compute(mat);
+
+    // cwiseAbs() yields real magnitudes, so compare against a real zero.
+    const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
+    PADDLE_ENFORCE_GT(min_abs_pivot,
+                      static_cast<T>(0),
+                      errors::InvalidArgument("Input is not invertible."));
+    const Matrix mat_inv = lu.inverse();
+    for (int i = 0; i < n * n; i++) {
+      *(a_inv_ptr + offset + i) =
+          static_cast<phi::dtype::complex<T>>(*(mat_inv.data() + i));
+    }
+  }
+};
+
 template <typename Context, typename T>
 void ComputeInverseEigen(const Context& dev_ctx,
                          const DenseTensor& a,
                          DenseTensor* a_inv) {
-  using Matrix =
-      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using EigenMatrixMap = Eigen::Map<Matrix>;
-  using ConstEigenMatrixMap = Eigen::Map<const Matrix>;
   const auto& mat_dims = a.dims();
   const int rank = mat_dims.size();
   int n = mat_dims[rank - 1];
@@ -41,17 +96,13 @@ void ComputeInverseEigen(const Context& dev_ctx,
   const T* a_ptr = a.data<T>();
   T* a_inv_ptr = dev_ctx.template Alloc<T>(a_inv);
 
+  // Eigen does not produce correct results when phi::dtype::complex is used
+  // directly as the scalar type of an Eigen::Matrix, so the functor
+  // specialization for complex types converts the data to std::complex
+  // before handing it to Eigen.
   for (int i = 0; i < batch_size; ++i) {
-    ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n);
-    EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n);
-    Eigen::PartialPivLU<Matrix> lu;
-    lu.compute(mat);
-
-    const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
-    PADDLE_ENFORCE_GT(min_abs_pivot,
-                      static_cast<T>(0),
-                      errors::InvalidArgument("Input is not invertible."));
-    mat_inv.noalias() = lu.inverse();
+    MapMatrixInverseFunctor<Context, T> functor;
+    functor(dev_ctx, a_ptr, a_inv_ptr, i * n * n, n);
   }
 }
 
diff --git a/paddle/phi/kernels/funcs/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc
index 99281f62cef37..3c93a8341d411 100644
--- a/paddle/phi/kernels/funcs/pooling.cc
+++ b/paddle/phi/kernels/funcs/pooling.cc
@@ -684,12 +684,16 @@ template class MaxPool2dGradFunctor<CPUContext, double>;
 
 template class Pool2dFunctor<CPUContext, MaxPool<float>, float>;
 template class Pool2dFunctor<CPUContext, AvgPool<float>, float>;
+template class Pool2dFunctor<CPUContext, LPPool<float>, float>;
 template class Pool2dGradFunctor<CPUContext, MaxPoolGrad<float>, float>;
 template class Pool2dGradFunctor<CPUContext, AvgPoolGrad<float>, float>;
+template class Pool2dGradFunctor<CPUContext, LPPoolGrad<float>, float>;
 template class Pool2dFunctor<CPUContext, MaxPool<double>, double>;
 template class Pool2dFunctor<CPUContext, AvgPool<double>, double>;
+template class Pool2dFunctor<CPUContext, LPPool<double>, double>;
 template class Pool2dGradFunctor<CPUContext, MaxPoolGrad<double>, double>;
 template class Pool2dGradFunctor<CPUContext, AvgPoolGrad<double>, double>;
+template class Pool2dGradFunctor<CPUContext, LPPoolGrad<double>, double>;
 
 /*
  * Tensors are in NCDHW or NDHWC format.
diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu
index 3d69d11c4f839..62537d5488e23 100644
--- a/paddle/phi/kernels/funcs/pooling.cu
+++ b/paddle/phi/kernels/funcs/pooling.cu
@@ -1005,12 +1005,16 @@ template class MaxPool2dGradFunctor<phi::GPUContext, dtype::bfloat16>;
 
 template class Pool2dFunctor<phi::GPUContext, MaxPool<float>, float>;
 template class Pool2dFunctor<phi::GPUContext, AvgPool<float>, float>;
+template class Pool2dFunctor<phi::GPUContext, LPPool<float>, float>;
 template class Pool2dGradFunctor<phi::GPUContext, MaxPoolGrad<float>, float>;
 template class Pool2dGradFunctor<phi::GPUContext, AvgPoolGrad<float>, float>;
+template class Pool2dGradFunctor<phi::GPUContext, LPPoolGrad<float>, float>;
 template class Pool2dFunctor<phi::GPUContext, MaxPool<double>, double>;
 template class Pool2dFunctor<phi::GPUContext, AvgPool<double>, double>;
+template class Pool2dFunctor<phi::GPUContext, LPPool<double>, double>;
 template class Pool2dGradFunctor<phi::GPUContext, MaxPoolGrad<double>, double>;
 template class Pool2dGradFunctor<phi::GPUContext, AvgPoolGrad<double>, double>;
+template class Pool2dGradFunctor<phi::GPUContext, LPPoolGrad<double>, double>;
 
 template class Pool2dFunctor<phi::GPUContext,
                              MaxPool<dtype::float16>,
@@ -1018,24 +1022,36 @@ template class Pool2dFunctor<phi::GPUContext,
 template class Pool2dFunctor<phi::GPUContext,
                              AvgPool<dtype::float16>,
                              dtype::float16>;
+template class Pool2dFunctor<phi::GPUContext,
+                             LPPool<dtype::float16>,
+                             dtype::float16>;
 template class Pool2dGradFunctor<phi::GPUContext,
                                  MaxPoolGrad<dtype::float16>,
                                  dtype::float16>;
 template class Pool2dGradFunctor<phi::GPUContext,
                                  AvgPoolGrad<dtype::float16>,
                                  dtype::float16>;
+template class Pool2dGradFunctor<phi::GPUContext,
+                                 LPPoolGrad<dtype::float16>,
+                                 dtype::float16>;
 template class Pool2dFunctor<phi::GPUContext,
                              MaxPool<dtype::bfloat16>,
                              dtype::bfloat16>;
 template class Pool2dFunctor<phi::GPUContext,
                              AvgPool<dtype::bfloat16>,
                              dtype::bfloat16>;
+template class Pool2dFunctor<phi::GPUContext,
+                             LPPool<dtype::bfloat16>,
+                             dtype::bfloat16>;
 template class Pool2dGradFunctor<phi::GPUContext,
                                  MaxPoolGrad<dtype::bfloat16>,
                                  dtype::bfloat16>;
 template class Pool2dGradFunctor<phi::GPUContext,
                                  AvgPoolGrad<dtype::bfloat16>,
                                  dtype::bfloat16>;
+template class Pool2dGradFunctor<phi::GPUContext,
+                                 LPPoolGrad<dtype::bfloat16>,
+                                 dtype::bfloat16>;
 
 template <typename PoolProcess, typename T>
 __global__ void KernelPool3D(const int nthreads,
diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h
index 3e91175e8a392..325116ce0cf7e 100644
--- a/paddle/phi/kernels/funcs/pooling.h
+++ b/paddle/phi/kernels/funcs/pooling.h
@@ -68,6 +68,27 @@ class AvgPool {
   }
 };
 
+// LP pooling: y = (sum x^p)^(1/p), computed in MT (no float truncation).
+template <class T>
+class LPPool {
+  using MT = typename dtype::MPTypeTrait<T>::Type;
+  MT intermediate_res;  // running sum of x^norm_type since initial()
+  float norm_type;      // exponent p; no default, assigned by setNormType()
+
+ public:
+  HOSTDEVICE inline void setNormType(float ntype) { norm_type = ntype; }
+  DEVICE inline T initial() {
+    intermediate_res = static_cast<MT>(0.0f);
+    return static_cast<T>(0);
+  }
+  DEVICE inline void compute(const T& x, T* y UNUSED) {
+    intermediate_res += pow(static_cast<MT>(x), static_cast<MT>(norm_type));
+  }
+  DEVICE inline void finalize(const T& pool_field UNUSED, T* y) {
+    *y = static_cast<T>(pow(intermediate_res, static_cast<MT>(1) / norm_type));
+  }
+};
+
 template <class T>
 class MaxPoolGrad {
  public:
@@ -88,6 +109,21 @@ class AvgPoolGrad {
   }
 };
 
+// LP pooling gradient: dx += dy * (x / y)^(p - 1), evaluated in double.
+template <class T>
+class LPPoolGrad {
+  float norm_type;  // exponent p; no default, assigned by setNormType()
+
+ public:
+  static constexpr bool use_x = true;
+  HOSTDEVICE inline void setNormType(float ntype) { norm_type = ntype; }
+  HOSTDEVICE inline void compute(
+      const T& x, const T& y, const T& dy, T scale UNUSED, T* dx) {
+    const double base = static_cast<double>(x) / static_cast<double>(y);
+    *dx += static_cast<T>(static_cast<double>(dy) * pow(base, norm_type - 1.0));
+  }
+};
+
 /* used for adaptive pool to calculate start and end index of each divided grid
  */
 HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc
index 9af1211b9a144..31c67cace95d6 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cc
+++ b/paddle/phi/kernels/funcs/segment_pooling.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 using Tensor = DenseTensor;
 
@@ -168,5 +167,4 @@ template class SegmentPoolGradFunctor<CPU, int64_t, int64_t>;
 template class SegmentPoolGradFunctor<CPU, float16, int>;
 template class SegmentPoolGradFunctor<CPU, float16, int64_t>;
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc
index 3e30bca02d8a4..924fb15c77218 100644
--- a/paddle/phi/kernels/funcs/sequence2batch.cc
+++ b/paddle/phi/kernels/funcs/sequence2batch.cc
@@ -14,8 +14,7 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/funcs/sequence2batch.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename T>
 class CopyMatrixRowsFunctor<phi::CPUContext, T> {
@@ -76,5 +75,4 @@ template class LoDTensor2BatchFunctor<phi::CPUContext, double>;
 template class Batch2LoDTensorFunctor<phi::CPUContext, float>;
 template class Batch2LoDTensorFunctor<phi::CPUContext, double>;
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/sequence_pooling.cc b/paddle/phi/kernels/funcs/sequence_pooling.cc
index f4ee9c323366e..1fdaadfea01a1 100644
--- a/paddle/phi/kernels/funcs/sequence_pooling.cc
+++ b/paddle/phi/kernels/funcs/sequence_pooling.cc
@@ -21,8 +21,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/jit/kernels.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template <typename T,
           int MajorType = Eigen::RowMajor,
@@ -498,5 +497,4 @@ template class SequencePoolFunctor<phi::CPUContext, double>;
 template class SequencePoolGradFunctor<phi::CPUContext, float>;
 template class SequencePoolGradFunctor<phi::CPUContext, double>;
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/funcs/softmax.cc b/paddle/phi/kernels/funcs/softmax.cc
index 2d8dffc3aec6d..ce41590b84420 100644
--- a/paddle/phi/kernels/funcs/softmax.cc
+++ b/paddle/phi/kernels/funcs/softmax.cc
@@ -17,13 +17,11 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/kernels/funcs/softmax_impl.h"
 
-namespace phi {
-namespace funcs {
+namespace phi::funcs {
 
 template class SoftmaxFunctor<phi::CPUContext, float>;
 template class SoftmaxFunctor<phi::CPUContext, double>;
 template class SoftmaxGradFunctor<phi::CPUContext, float>;
 template class SoftmaxGradFunctor<phi::CPUContext, double>;
 
-}  // namespace funcs
-}  // namespace phi
+}  // namespace phi::funcs
diff --git a/paddle/phi/kernels/fusion/cpu/fused_elemwise_activation_grad_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_elemwise_activation_grad_kernel.cc
new file mode 100644
index 0000000000000..818722a224867
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cpu/fused_elemwise_activation_grad_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h"
+PD_REGISTER_KERNEL(fused_elemwise_activation_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseActivationGradKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(fused_elemwise_add_activation_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseAddActivationGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/fusion/cpu/fused_elemwise_activation_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_elemwise_activation_kernel.cc
new file mode 100644
index 0000000000000..9eb7668fb054e
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cpu/fused_elemwise_activation_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h"
+PD_REGISTER_KERNEL(fused_elemwise_activation,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseActivationKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(fused_elemwise_add_activation,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseAddActivationKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc
index 851affbb21f0f..536b127b5bd71 100644
--- a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc
+++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc
@@ -16,8 +16,7 @@
 #include "paddle/phi/kernels/elementwise_add_kernel.h"
 #include "paddle/phi/kernels/softmax_kernel.h"
 
-namespace phi {
-namespace fusion {
+namespace phi::fusion {
 
 template <typename T, typename Context>
 void FusedSoftmaxMaskKernel(const Context& dev_ctx,
@@ -57,8 +56,7 @@ void FusedSoftmaxMaskKernel(const Context& dev_ctx,
   SoftmaxKernel<T, Context>(dev_ctx, t, 3, out);  // axis for softmax
 }
 
-}  // namespace fusion
-}  // namespace phi
+}  // namespace phi::fusion
 
 PD_REGISTER_KERNEL(fused_softmax_mask,
                    CPU,
diff --git a/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc
new file mode 100644
index 0000000000000..522d7b77b559c
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cpu/fusion_lstm_kernel.cc
@@ -0,0 +1,443 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/fc_functor.h"
+#include "paddle/phi/kernels/funcs/jit/kernels.h"
+#include "paddle/phi/kernels/funcs/sequence2batch.h"
+
+namespace phi {
+
+#define INIT_BASE_DEFINES                \
+  auto *x = &x_in;                       \
+  auto *h0 = h0_in.get_ptr();            \
+  auto *c0 = c0_in.get_ptr();            \
+  auto *wx = &weight_x_in;               \
+  auto *wh = &weight_h_in;               \
+  auto *bias = &bias_in;                 \
+  auto *hidden_out = hidden;             \
+  auto *cell_out = cell;                 \
+  auto x_dims = x->dims();   /* T x M*/  \
+  auto wh_dims = wh->dims(); /* D x 4D*/ \
+  const int M = x_dims[1];               \
+  const int D = wh_dims[0];              \
+  const int D4 = wh_dims[1]
+
+#define INIT_OTHER_DEFINES                                             \
+  const T *x_data = x->data<T>();                                      \
+  const T *wx_data = wx->data<T>();                                    \
+  const T *wh_data = wh->data<T>();                                    \
+  /* diagonal weight*/                                                 \
+  const T *wp_data = bias->data<T>() + D4;                             \
+  /* for peephole only*/                                               \
+  T *checked_cell_data = nullptr;                                      \
+  if (use_peepholes) {                                                 \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                   \
+    checked_cell_data = dev_ctx.template Alloc<T>(checked_cell);       \
+  }                                                                    \
+  const phi::jit::lstm_attr_t attr(                                    \
+      D,                                                               \
+      phi::jit::to_kerneltype(gate_activation),                        \
+      phi::jit::to_kerneltype(candidate_activation),                   \
+      phi::jit::to_kerneltype(cell_activation),                        \
+      use_peepholes);                                                  \
+  phi::jit::lstm_t one_step;                                           \
+  one_step.wp = wp_data;                                               \
+  one_step.checked = checked_cell_data;                                \
+  auto ComputeC1H1 = phi::jit::KernelFuncs<phi::jit::LSTMC1H1Tuple<T>, \
+                                           phi::CPUPlace>::Cache()     \
+                         .At(attr);                                    \
+  auto ComputeCtHt = phi::jit::KernelFuncs<phi::jit::LSTMCtHtTuple<T>, \
+                                           phi::CPUPlace>::Cache()     \
+                         .At(attr)
+
+// Wh GEMM: out(bs x D4) += prev(bs x D) * wh_data(D x D4), via caller's blas.
+#define GEMM_WH_ADDON(bs, prev, out) \
+  blas.GEMM(CblasNoTrans,            \
+            CblasNoTrans,            \
+            bs,                      \
+            D4,                      \
+            D,                       \
+            static_cast<T>(1),       \
+            prev,                    \
+            D,                       \
+            wh_data,                 \
+            D4,                      \
+            static_cast<T>(1),       \
+            out,                     \
+            D4)
+
+template <typename T, typename Context>
+void SeqCompute(const Context &dev_ctx,
+                const DenseTensor &x_in,
+                const DenseTensor &weight_x_in,
+                const DenseTensor &weight_h_in,
+                const DenseTensor &bias_in,
+                const paddle::optional<DenseTensor> &h0_in,
+                const paddle::optional<DenseTensor> &c0_in,
+                bool use_peepholes,
+                bool is_reverse,
+                bool use_seq,
+                const std::string &gate_activation,
+                const std::string &cell_activation,
+                const std::string &candidate_activation,
+                float scale_data,
+                float shift_data,
+                const std::vector<float> &scale_weights,
+                bool force_fp32_output,
+                DenseTensor *hidden,
+                DenseTensor *cell,
+                DenseTensor *xx,
+                DenseTensor *batched_input,
+                DenseTensor *batched_hidden,
+                DenseTensor *batched_cell,
+                DenseTensor *reordered_h0,
+                DenseTensor *reordered_c0,
+                DenseTensor *checked_cell) {
+  INIT_BASE_DEFINES;  // x, h0, c0, wx, wh, bias, hidden_out, cell_out, M, D, D4
+  INIT_OTHER_DEFINES;  // *_data pointers, attr, one_step, ComputeC1H1/CtHt
+  auto x_lod = x->lod();
+  const int total_T = static_cast<int>(x_dims[0]);  // rows of x (T in "T x M")
+  const int N = static_cast<int>(x_lod[0].size() - 1);  // number of sequences
+  const T *h0_data = h0 ? h0->data<T>() : nullptr;
+  const T *c0_data = c0 ? c0->data<T>() : nullptr;
+  T *xx_data = dev_ctx.template Alloc<T>(xx);
+  T *h_out_data = dev_ctx.template Alloc<T>(hidden_out);
+  T *c_out_data = dev_ctx.template Alloc<T>(cell_out);
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  // Input projection for all rows at once (presumably xx = x * wx + bias).
+  phi::funcs::FCFunctor<Context, T> fc;
+  fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data<T>());
+  // Per-step pointer strides; flipped below to walk backwards when reversed.
+  int xx_offset = D4;
+  int gate_offset = D;
+  if (is_reverse) {
+    const int offset = (total_T - 1) * D;
+    xx_data = xx_data + offset * 4;
+    h_out_data = h_out_data + offset;
+    c_out_data = c_out_data + offset;
+    xx_offset = -D4;
+    gate_offset = -D;
+  }
+  // One sequence at a time; the pointers keep advancing across sequences.
+  for (int i = 0; i < N; ++i) {
+    int bid = is_reverse ? N - 1 - i : i;
+    int seq_len = static_cast<int>(x_lod[0][bid + 1] - x_lod[0][bid]);
+    const T *prev_c_data = nullptr;
+    const T *prev_h_data = nullptr;
+    int tstart = 0;
+    if (h0_data) {  // initial state given: every step uses ComputeCtHt
+      prev_h_data = h0_data + bid * D;
+      prev_c_data = c0_data + bid * D;  // NOTE: assumes c0 given whenever h0 is
+    } else {  // no initial state: first step goes through ComputeC1H1
+      one_step.gates = xx_data;
+      one_step.ct = c_out_data;
+      one_step.ht = h_out_data;
+      ComputeC1H1(&one_step, &attr);
+      tstart = 1;
+      // move one step
+      prev_h_data = h_out_data;
+      prev_c_data = c_out_data;
+      xx_data = xx_data + xx_offset;
+      h_out_data = h_out_data + gate_offset;
+      c_out_data = c_out_data + gate_offset;
+    }
+    for (int step = tstart; step < seq_len; ++step) {
+      GEMM_WH_ADDON(1, prev_h_data, xx_data);
+
+      one_step.gates = xx_data;
+      one_step.ct_1 = prev_c_data;
+      one_step.ct = c_out_data;
+      one_step.ht = h_out_data;
+      ComputeCtHt(&one_step, &attr);
+      // move one step
+      prev_h_data = h_out_data;
+      prev_c_data = c_out_data;
+      xx_data = xx_data + xx_offset;
+      h_out_data = h_out_data + gate_offset;
+      c_out_data = c_out_data + gate_offset;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void BatchCompute(const Context &dev_ctx,
+                  const DenseTensor &x_in,
+                  const DenseTensor &weight_x_in,
+                  const DenseTensor &weight_h_in,
+                  const DenseTensor &bias_in,
+                  const paddle::optional<DenseTensor> &h0_in,
+                  const paddle::optional<DenseTensor> &c0_in,
+                  bool use_peepholes,
+                  bool is_reverse,
+                  bool use_seq,
+                  const std::string &gate_activation,
+                  const std::string &cell_activation,
+                  const std::string &candidate_activation,
+                  float scale_data,
+                  float shift_data,
+                  const std::vector<float> &scale_weights,
+                  bool force_fp32_output,
+                  DenseTensor *hidden,
+                  DenseTensor *cell,
+                  DenseTensor *xx,
+                  DenseTensor *batched_input,
+                  DenseTensor *batched_hidden,
+                  DenseTensor *batched_cell,
+                  DenseTensor *reordered_h0,
+                  DenseTensor *reordered_c0,
+                  DenseTensor *checked_cell) {
+  INIT_BASE_DEFINES;  // x, h0, c0, wx, wh, bias, hidden_out, cell_out, M, D, D4
+  if (x->lod()[0].size() == 2) {  // single sequence: delegate to SeqCompute
+    xx->Resize({x_dims[0], D4});
+    SeqCompute<T, Context>(dev_ctx,
+                           x_in,
+                           weight_x_in,
+                           weight_h_in,
+                           bias_in,
+                           h0_in,
+                           c0_in,
+                           use_peepholes,
+                           is_reverse,
+                           use_seq,
+                           gate_activation,
+                           cell_activation,
+                           candidate_activation,
+                           scale_data,
+                           shift_data,
+                           scale_weights,
+                           force_fp32_output,
+                           hidden,
+                           cell,
+                           xx,
+                           batched_input,
+                           batched_hidden,
+                           batched_cell,
+                           reordered_h0,
+                           reordered_c0,
+                           checked_cell);
+    return;
+  }
+  INIT_OTHER_DEFINES;  // *_data pointers, attr, one_step, ComputeC1H1/CtHt
+
+  auto *batched_c_out = batched_cell;
+  auto *batched_h_out = batched_hidden;
+  T *xx_data = dev_ctx.template Alloc<T>(xx);
+  T *batched_input_data = dev_ctx.template Alloc<T>(batched_input);
+  T *batched_c_out_data = dev_ctx.template Alloc<T>(batched_c_out);
+  T *batched_h_out_data = dev_ctx.template Alloc<T>(batched_h_out);
+  dev_ctx.template Alloc<T>(hidden_out);
+  dev_ctx.template Alloc<T>(cell_out);
+
+  phi::funcs::LoDTensor2BatchFunctor<Context, T> to_batch;
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  phi::funcs::FCFunctor<Context, T> fc;
+  if (M > D4) {  // project first, then reorder the D4-wide result
+    fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>());
+    to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
+  } else {  // reorder the M-wide input first, then project
+    to_batch(dev_ctx, *x, xx, true, is_reverse);
+    batched_input->set_lod(xx->lod());
+    fc(dev_ctx,
+       x_dims[0],
+       D4,
+       M,
+       xx_data,
+       wx_data,
+       batched_input_data,
+       bias->data<T>());
+  }
+
+  auto batched_lod = batched_input->lod();
+  const auto &seq_order = batched_lod[2];
+  const int max_bs = static_cast<int>(seq_order.size());
+  reordered_h0->Resize({max_bs, D});
+  reordered_c0->Resize({max_bs, D});
+
+  int tstart = 0;
+  T *prev_h_data = nullptr;
+  T *prev_c_data = nullptr;
+  if (h0) {
+    // reorder h0, c0
+    T *reordered_h0_data = dev_ctx.template Alloc<T>(reordered_h0);
+    T *reordered_c0_data = dev_ctx.template Alloc<T>(reordered_c0);
+    const T *h0_data = h0->data<T>();
+    const T *c0_data = c0->data<T>();  // NOTE: c0 is not null-checked here
+    prev_h_data = reordered_h0_data;
+    prev_c_data = reordered_c0_data;
+    size_t sz = D;
+    for (int i = 0; i < max_bs; ++i) {
+      blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data);
+      blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data);
+      reordered_h0_data += D;
+      reordered_c0_data += D;
+    }
+  } else {
+    // compute without h0, c0
+    T *cur_in_data = batched_input_data;
+    T *cur_h_out_data = batched_h_out_data;
+    T *cur_c_out_data = batched_c_out_data;
+    for (int i = 0; i < max_bs; ++i) {
+      one_step.gates = cur_in_data;
+      one_step.ct = cur_c_out_data;
+      one_step.ht = cur_h_out_data;
+      ComputeC1H1(&one_step, &attr);
+
+      cur_in_data += D4;
+      cur_c_out_data += D;
+      cur_h_out_data += D;
+    }
+    tstart = 1;
+    prev_h_data = batched_h_out_data;
+    prev_c_data = batched_c_out_data;
+  }
+
+  // compute kernel part
+  const auto &batch_starts = batched_lod[0];
+  const int max_seq_len = static_cast<int>(batch_starts.size() - 1);
+  const int offset = tstart * max_bs * D;  // skips step 0 when already done
+  batched_input_data = batched_input_data + offset * 4;  // needs D4 == 4 * D
+  batched_h_out_data = batched_h_out_data + offset;
+  batched_c_out_data = batched_c_out_data + offset;
+  for (int step = tstart; step < max_seq_len; ++step) {
+    const int cur_bs =
+        static_cast<int>(batch_starts[step + 1] - batch_starts[step]);
+    GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+    T *cur_in_data = batched_input_data;
+    T *cur_prev_c_data = prev_c_data;
+    T *cur_c_out_data = batched_c_out_data;
+    T *cur_h_out_data = batched_h_out_data;
+    for (int i = 0; i < cur_bs; ++i) {
+      one_step.gates = cur_in_data;
+      one_step.ct_1 = cur_prev_c_data;
+      one_step.ct = cur_c_out_data;
+      one_step.ht = cur_h_out_data;
+      ComputeCtHt(&one_step, &attr);
+
+      // move one batch
+      cur_in_data += D4;
+      cur_prev_c_data += D;
+      cur_c_out_data += D;
+      cur_h_out_data += D;
+    }
+    // move one step
+    prev_c_data = batched_c_out_data;
+    prev_h_data = batched_h_out_data;
+    batched_c_out_data = cur_c_out_data;
+    batched_h_out_data = cur_h_out_data;
+    batched_input_data = cur_in_data;
+  }
+
+  phi::funcs::Batch2LoDTensorFunctor<Context, T> to_seq;
+  batched_h_out->set_lod(batched_lod);
+  to_seq(dev_ctx, *batched_h_out, hidden_out);
+  batched_c_out->set_lod(batched_lod);
+  to_seq(dev_ctx, *batched_c_out, cell_out);
+}
+
+template <typename T, typename Context>
+void FusionLSTMKernel(const Context &dev_ctx,
+                      const DenseTensor &x_in,
+                      const DenseTensor &weight_x_in,
+                      const DenseTensor &weight_h_in,
+                      const DenseTensor &bias_in,
+                      const paddle::optional<DenseTensor> &h0_in,
+                      const paddle::optional<DenseTensor> &c0_in,
+                      bool use_peepholes,
+                      bool is_reverse,
+                      bool use_seq,
+                      const std::string &gate_activation,
+                      const std::string &cell_activation,
+                      const std::string &candidate_activation,
+                      float scale_data,
+                      float shift_data,
+                      const std::vector<float> &scale_weights,
+                      bool force_fp32_output,
+                      DenseTensor *hidden,
+                      DenseTensor *cell,
+                      DenseTensor *xx,
+                      DenseTensor *batched_input,
+                      DenseTensor *batched_hidden,
+                      DenseTensor *batched_cell,
+                      DenseTensor *reordered_h0,
+                      DenseTensor *reordered_c0,
+                      DenseTensor *checked_cell) {
+  if (use_seq) {  // sequence-by-sequence path; all arguments are forwarded
+    SeqCompute<T, Context>(dev_ctx,
+                           x_in,
+                           weight_x_in,
+                           weight_h_in,
+                           bias_in,
+                           h0_in,
+                           c0_in,
+                           use_peepholes,
+                           is_reverse,
+                           use_seq,
+                           gate_activation,
+                           cell_activation,
+                           candidate_activation,
+                           scale_data,
+                           shift_data,
+                           scale_weights,
+                           force_fp32_output,
+                           hidden,
+                           cell,
+                           xx,
+                           batched_input,
+                           batched_hidden,
+                           batched_cell,
+                           reordered_h0,
+                           reordered_c0,
+                           checked_cell);
+  } else {  // batched path (BatchCompute may still fall back to SeqCompute)
+    BatchCompute<T, Context>(dev_ctx,
+                             x_in,
+                             weight_x_in,
+                             weight_h_in,
+                             bias_in,
+                             h0_in,
+                             c0_in,
+                             use_peepholes,
+                             is_reverse,
+                             use_seq,
+                             gate_activation,
+                             cell_activation,
+                             candidate_activation,
+                             scale_data,
+                             shift_data,
+                             scale_weights,
+                             force_fp32_output,
+                             hidden,
+                             cell,
+                             xx,
+                             batched_input,
+                             batched_hidden,
+                             batched_cell,
+                             reordered_h0,
+                             reordered_c0,
+                             checked_cell);
+  }
+}
+
+#undef GEMM_WH_ADDON
+#undef INIT_OTHER_DEFINES
+#undef INIT_BASE_DEFINES
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    fusion_lstm, CPU, ALL_LAYOUT, phi::FusionLSTMKernel, float, double) {}
diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc
index 4ff18849316d8..456d3370990cb 100644
--- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc
+++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc
@@ -23,8 +23,7 @@
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
 
-namespace phi {
-namespace fusion {
+namespace phi::fusion {
 
 template <typename T, typename Context>
 void FusionSeqConvEltAddReluKernel(const Context& dev_ctx,
@@ -148,8 +147,7 @@ void FusionSeqConvEltAddReluKernel(const Context& dev_ctx,
      true);
 }
 
-}  // namespace fusion
-}  // namespace phi
+}  // namespace phi::fusion
 
 PD_REGISTER_KERNEL(fusion_seqconv_eltadd_relu,
                    CPU,
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
index d760ce773c135..abcf220aa5c54 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.23)
+cmake_minimum_required(VERSION 3.18)
 
 if(NOT DEFINED PYTHON_EXECUTABLE)
   message(
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
index eb13c7dd6723d..b945df846ffe4 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,21 +13,38 @@
 # limitations under the License.
 set -e
 
-cutlass_repo_directory="cutlass"
-if [ ! -d "$cutlass_repo_directory" ]; then
-     git clone --branch v3.0.0  https://github.com/NVIDIA/cutlass
-fi
-
 build_directory="build"
 if [ ! -d "$build_directory" ]; then
     mkdir $build_directory
 fi
 
-python_exe_path="python"
-cuda_root_path="/usr/local/cuda"
-gpu_cc="80"
+libname="$build_directory/libCutlassConv2d.so"
+if [ -e "$libname" ]; then
+    exit 0  # library already built; skip clone, configure and build
+fi
+
+default_python_exe_path="/usr/bin/python"
+default_cuda_root_path="/usr/local/cuda"
+default_gpu_cc="80"
+default_cmake_command="cmake"
+
+python_exe_path="${1:-$default_python_exe_path}"  # $1: python interpreter
+cuda_root_path="${2:-$default_cuda_root_path}"  # $2: CUDA toolkit root
+gpu_cc="${3:-$default_gpu_cc}"  # $3: compute capability, e.g. 80
+cmake_command="${4:-$default_cmake_command}"  # $4: cmake executable
+
+case "$gpu_cc" in
+    75|80|86|89)  ;;
+    *)  echo "compile.sh: unsupported gpu_cc '$gpu_cc', skipping build" >&2 || true
+        exit 0  ;;  # a skip is reported but is still not a failure
+esac
+
+cutlass_repo_directory="cutlass"
+if [ ! -d "$cutlass_repo_directory" ]; then
+    git clone --branch v3.0.0  https://github.com/NVIDIA/cutlass
+fi
 
 cd $build_directory
-cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc
-make -j 
+$cmake_command .. -DPYTHON_EXECUTABLE="$python_exe_path" -DCUDA_TOOLKIT_ROOT_DIR="$cuda_root_path" -DCOMPUTE_CAPABILITY="$gpu_cc"  # values quoted: paths may contain spaces; $cmake_command stays unquoted so it may carry extra words
+make -j8  # runs inside the build directory entered by the preceding cd
 cd -
diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt
index 6ad5035e9dcd6..fc9cfa1cfd919 100644
--- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt
+++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.23)
+cmake_minimum_required(VERSION 3.18)
 
 if(NOT DEFINED PYTHON_EXECUTABLE)
   message(
diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh
index f8a5463239a95..07736079b72a3 100644
--- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh
+++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,21 +13,38 @@
 # limitations under the License.
 set -e
 
-cutlass_repo_directory="cutlass"
-if [ ! -d "$cutlass_repo_directory" ]; then
-    git clone --branch v2.11.0  https://github.com/NVIDIA/cutlass
-fi
-
 build_directory="build"
 if [ ! -d "$build_directory" ]; then
     mkdir $build_directory
 fi
 
-python_exe_path="/usr/bin/python"
-cuda_root_path="/usr/local/cuda"
-gpu_cc="80"
+libname="$build_directory/libCutlassGemmEpilogue.so"
+if [ -e "$libname" ]; then
+    exit 0  # library already built; skip clone, configure and build
+fi
+
+default_python_exe_path="/usr/bin/python"
+default_cuda_root_path="/usr/local/cuda"
+default_gpu_cc="80"
+default_cmake_command="cmake"
+
+python_exe_path="${1:-$default_python_exe_path}"  # $1: python interpreter
+cuda_root_path="${2:-$default_cuda_root_path}"  # $2: CUDA toolkit root
+gpu_cc="${3:-$default_gpu_cc}"  # $3: compute capability, e.g. 80
+cmake_command="${4:-$default_cmake_command}"  # $4: cmake executable
+
+case "$gpu_cc" in
+    80|86|89)  ;;
+    *)  echo "compile.sh: unsupported gpu_cc '$gpu_cc', skipping build" >&2 || true
+        exit 0  ;;  # a skip is reported but is still not a failure
+esac
+
+cutlass_repo_directory="cutlass"
+if [ ! -d "$cutlass_repo_directory" ]; then
+    git clone --branch v2.11.0  https://github.com/NVIDIA/cutlass
+fi
 
 cd $build_directory
-cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc
-make -j 
+$cmake_command .. -DPYTHON_EXECUTABLE="$python_exe_path" -DCUDA_TOOLKIT_ROOT_DIR="$cuda_root_path" -DCOMPUTE_CAPABILITY="$gpu_cc"  # values quoted: paths may contain spaces; $cmake_command stays unquoted so it may carry extra words
+make -j8  # runs inside the build directory entered by the preceding cd
 cd -
diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h
index 8f1be5983f646..8b36a43fdf843 100644
--- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h
+++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #pragma once
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #include <vector>
 
 #include "paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_decl.h"
diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu
index 9b18bbe0e9220..c552f1a00d763 100644
--- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu
+++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu
@@ -204,6 +204,5 @@ PD_REGISTER_KERNEL(gemm_epilogue,
                    GPU,
                    ALL_LAYOUT,
                    phi::fusion::cutlass_internal::GemmEpilogueKernel,
-                   float,
                    phi::dtype::bfloat16,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu
new file mode 100644
index 0000000000000..456fa415e4873
--- /dev/null
+++ b/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu
@@ -0,0 +1,33 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h"
+
+PD_REGISTER_KERNEL(fused_elemwise_activation_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseActivationGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}  // GPU dtypes: fp32, fp64, fp16
+
+PD_REGISTER_KERNEL(fused_elemwise_add_activation_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseAddActivationGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}  // GPU dtypes: fp32, fp64, fp16
diff --git a/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu
new file mode 100644
index 0000000000000..8bd925bbe0264
--- /dev/null
+++ b/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu
@@ -0,0 +1,33 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h"
+
+PD_REGISTER_KERNEL(fused_elemwise_activation,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseActivationKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}  // GPU dtypes: fp32, fp64, fp16
+
+PD_REGISTER_KERNEL(fused_elemwise_add_activation,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FusedElemwiseAddActivationKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}  // GPU dtypes: fp32, fp64, fp16
diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc
index c46d7e77c8420..4f3da493fb4e7 100644
--- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc
+++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc
@@ -15,8 +15,7 @@
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace fusion {
+namespace phi::fusion {
 
 template <typename T, dnnl::algorithm BINARY_OP>
 void FusedElementwiseKernel(const OneDNNContext& dev_ctx,
@@ -177,8 +176,7 @@ DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedSubtract, dnnl::algorithm::binary_sub)
 DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedMultiply, dnnl::algorithm::binary_mul)
 DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedDivide, dnnl::algorithm::binary_div)
 
-}  // namespace fusion
-}  // namespace phi
+}  // namespace phi::fusion
 
 PD_REGISTER_KERNEL(fused_elementwise_add,
                    OneDNN,
diff --git a/paddle/phi/kernels/fusion/onednn/fused_softplus_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_softplus_kernel.cc
index 56e9a93c59dd2..cf557e7087f4b 100644
--- a/paddle/phi/kernels/fusion/onednn/fused_softplus_kernel.cc
+++ b/paddle/phi/kernels/fusion/onednn/fused_softplus_kernel.cc
@@ -17,8 +17,7 @@
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace fusion {
+namespace phi::fusion {
 
 template <typename T, typename Context>
 void FusedSoftplusKernel(const Context& dev_ctx,
@@ -56,8 +55,7 @@ void FusedSoftplusKernel(const Context& dev_ctx,
   out->set_mem_desc(dst_memory_p->get_desc());
 }
 
-}  // namespace fusion
-}  // namespace phi
+}  // namespace phi::fusion
 
 PD_REGISTER_KERNEL(fused_softplus,
                    OneDNN,
diff --git a/paddle/phi/kernels/fusion/onednn/fusion_lstm_kernel.cc b/paddle/phi/kernels/fusion/onednn/fusion_lstm_kernel.cc
new file mode 100644
index 0000000000000..02a3fd7fc3fb9
--- /dev/null
+++ b/paddle/phi/kernels/fusion/onednn/fusion_lstm_kernel.cc
@@ -0,0 +1,573 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/convert_utils.h"
+#include "paddle/phi/core/expect.h"
+#include "paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+namespace fusion {
+
+using phi::OneDNNContext;
+using phi::funcs::OneDNNGetDataType;
+using phi::funcs::OneDNNMemDesc;
+using phi::funcs::RNNReorderType;
+using OneDNNMemoryFormat = dnnl::memory::format_tag;
+
+template <typename T, typename T_out = T>
+class LSTMMKLDNNHandler
+    : public RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out> {
+ public:
+  LSTMMKLDNNHandler(const OneDNNContext& dev_ctx,
+                    const dnnl::engine onednn_engine,
+                    phi::Place cpu_place UNUSED,
+                    const phi::DenseTensor* input,
+                    const phi::DenseTensor* weight_h,
+                    const phi::DenseTensor* h0,
+                    const phi::DenseTensor* c0 UNUSED,
+                    const bool is_reverse,
+                    const int64_t N,   // batch size (n of the tnc input)
+                    const int64_t Ti,  // time steps (t of the tnc input)
+                    const int64_t IC,  // input channels
+                    const int64_t OC,  // output channels
+                    const std::string& unique_name UNUSED,
+                    float scale_data,
+                    float shift_data,
+                    std::vector<float> scale_weights,
+                    bool use_peepholes,
+                    std::string gate_activation,
+                    std::string cell_activation,
+                    std::string candidate_activation)
+      : RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out>(dev_ctx,
+                                                       onednn_engine,
+                                                       dev_ctx.GetPlace(),
+                                                       input,
+                                                       weight_h,
+                                                       h0,
+                                                       is_reverse,
+                                                       N,
+                                                       Ti,
+                                                       IC,
+                                                       OC,
+                                                       4,
+                                                       "x_weight_h",
+                                                       scale_data,
+                                                       shift_data,
+                                                       scale_weights) {
+    if (unlikely(!this->isCached())) {
+      const bool is_INT8 = std::is_same<T, uint8_t>::value;
+      // The oneDNN LSTM kernel has hardcoded activations; reject anything else
+      PADDLE_ENFORCE_EQ(
+          gate_activation,
+          "sigmoid",
+          phi::errors::Unimplemented("oneDNN fusion_lstm supports only "
+                                     "sigmoid as a gate activation."));
+      PADDLE_ENFORCE_EQ(
+          cell_activation,
+          "tanh",
+          phi::errors::Unimplemented(
+              "oneDNN fusion_lstm supports only tanh as a cell activation."));
+      PADDLE_ENFORCE_EQ(
+          candidate_activation,
+          "tanh",
+          phi::errors::Unimplemented("oneDNN fusion_lstm supports only "
+                                     "tanh as a candidate activation."));
+
+      // Weights for the int8 kernel are of type s8
+      const auto weights_dt =
+          is_INT8 ? dnnl::memory::data_type::s8 : OneDNNGetDataType<T>();
+
+      // oneDNN RNN dimensions
+      const int64_t D = 1;  // Directions
+      const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
+      const int64_t G = 4;  // Number of Gates, 4 for LSTM
+
+      // Create memory descriptors; format `any` leaves the layout to oneDNN
+      auto input_md = OneDNNMemDesc(
+          {Ti, N, IC}, OneDNNGetDataType<T>(), OneDNNMemoryFormat::tnc);
+      auto weight_x_md =
+          OneDNNMemDesc({L, D, IC, G, OC}, weights_dt, OneDNNMemoryFormat::any);
+      auto weight_h_md =
+          OneDNNMemDesc({L, D, OC, G, OC}, weights_dt, OneDNNMemoryFormat::any);
+      auto bias_md = OneDNNMemDesc(
+          {L, D, G, OC}, OneDNNGetDataType<float>(), OneDNNMemoryFormat::ldgo);
+      auto hidden_md = OneDNNMemDesc(
+          {Ti, N, OC}, OneDNNGetDataType<T_out>(), OneDNNMemoryFormat::any);
+
+      auto h0_md = OneDNNMemDesc(
+          {L, D, N, OC}, OneDNNGetDataType<T>(), OneDNNMemoryFormat::any);
+      auto c0_md = OneDNNMemDesc(
+          {L, D, N, OC}, OneDNNGetDataType<float>(), OneDNNMemoryFormat::any);
+
+      // Create LSTM oneDNN primitive (inference only, single direction)
+      const auto direction =
+          is_reverse ? dnnl::rnn_direction::unidirectional_right2left
+                     : dnnl::rnn_direction::unidirectional_left2right;
+      if (!use_peepholes) {
+        this->AcquireForwardPrimitiveDescriptor(
+            this->attr_,
+            dnnl::prop_kind::forward_inference,
+            direction,
+            input_md,
+            h0_md,
+            c0_md,
+            weight_x_md,
+            weight_h_md,
+            bias_md,
+            hidden_md,
+            dnnl::memory::desc(),
+            dnnl::memory::desc());
+      } else {
+        auto weight_peephole_md = OneDNNMemDesc({L, D, 3, OC},
+                                                OneDNNGetDataType<float>(),
+                                                OneDNNMemoryFormat::ldgo);
+        this->AcquireForwardPrimitiveDescriptor(
+            this->attr_,
+            dnnl::prop_kind::forward_inference,
+            direction,
+            input_md,
+            h0_md,
+            c0_md,
+            weight_x_md,
+            weight_h_md,
+            weight_peephole_md,
+            bias_md,
+            hidden_md,
+            dnnl::memory::desc(),
+            dnnl::memory::desc());
+      }
+    }
+  }
+
+  // Permutes gate blocks in place from PaddlePaddle order {c, i, f, o} to
+  // oneDNN order {i, f, c, o}. `weights` is treated as I rows of G blocks,
+  // each block holding OC contiguous elements of type U. Two adjacent-block
+  // swaps per row do the job: {c,i,f,o} -> {i,c,f,o} -> {i,f,c,o}.
+  template <typename U>
+  void ReorderGates(U* weights, int64_t I) {
+    const size_t inner_block_size = this->OC;
+    const size_t block_size = inner_block_size * this->G;
+    for (size_t i = 0; i < static_cast<size_t>(I); ++i) {
+      const size_t offset = i * block_size;
+
+      U* base_pos = weights + offset;
+      std::swap_ranges(base_pos,
+                       base_pos + inner_block_size,
+                       base_pos + inner_block_size);  // c <-> i
+      std::swap_ranges(base_pos + inner_block_size,
+                       base_pos + 2 * inner_block_size,
+                       base_pos + 2 * inner_block_size);  // c <-> f
+    }
+  }
+
+  // Returns the layer (input) weights as oneDNN memory. On a cache miss the
+  // tensor is copied, gate-permuted, reordered and stored under "@weight_x".
+  template <typename U>
+  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(
+      const phi::DenseTensor* weight_x) {
+    const std::string wx_key = this->memory_key_ + "@weight_x";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
+
+    if (!memory_p) {
+      auto user_md = OneDNNMemDesc({1, 1, this->IC, this->G, this->OC},
+                                   OneDNNGetDataType<U>(),
+                                   OneDNNMemoryFormat::ldigo);
+      auto user_memory = dnnl::memory(user_md, this->engine_);
+      auto* weight_x_data = reinterpret_cast<U*>(user_memory.get_data_handle());
+      memcpy(weight_x_data,
+             weight_x->data<U>(),
+             sizeof(U) * this->IC * this->G * this->OC);
+      // Permute gate blocks to oneDNN order before the layout reorder.
+      ReorderGates(weight_x_data, this->IC);
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->weights_layer_desc(), this->engine_);
+      // NOTE(review): reorder is not followed by astream.wait(); confirm OK.
+      auto& astream = OneDNNContext::tls().get_stream();
+      dnnl::reorder(user_memory, *memory_p, this->attr_)
+          .execute(astream, user_memory, *memory_p);
+
+      this->dev_ctx_.SetBlob(wx_key, memory_p);
+    }
+    return memory_p;
+  }
+
+  // Returns the recurrent (iter) weights as oneDNN memory. On a cache miss the
+  // tensor is copied, gate-permuted, reordered and stored under "@weight_h".
+  template <typename U>
+  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(
+      const phi::DenseTensor* weight_h) {
+    const std::string wh_key = this->memory_key_ + "@weight_h";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
+
+    if (!memory_p) {
+      auto user_md = OneDNNMemDesc({1, 1, this->OC, this->G, this->OC},
+                                   OneDNNGetDataType<U>(),
+                                   OneDNNMemoryFormat::ldigo);
+      auto user_memory = dnnl::memory(user_md, this->engine_);
+      auto* weight_h_data = reinterpret_cast<U*>(user_memory.get_data_handle());
+      memcpy(weight_h_data,
+             weight_h->data<U>(),
+             sizeof(U) * this->OC * this->G * this->OC);
+      // Permute gate blocks to oneDNN order before the layout reorder.
+      ReorderGates(weight_h_data, this->OC);
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->weights_iter_desc(), this->engine_);
+      // NOTE(review): reorder is not followed by astream.wait(); confirm OK.
+      auto& astream = OneDNNContext::tls().get_stream();
+      dnnl::reorder(user_memory, *memory_p, this->attr_)
+          .execute(astream, user_memory, *memory_p);
+
+      this->dev_ctx_.SetBlob(wh_key, memory_p);
+    }
+    return memory_p;
+  }
+
+  // Returns the gate-bias memory (G * OC floats), cached under
+  // "<memory_key_>@bias". A null `bias` yields an all-zero bias.
+  std::shared_ptr<dnnl::memory> AcquireBiasMemory(
+      const phi::DenseTensor* bias) {
+    const std::string bias_key = this->memory_key_ + "@bias";
+    auto memory_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(bias_key));
+
+    if (!memory_p) {
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
+                                                this->engine_);
+      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
+      if (bias) {
+        const float* user_bias_data =
+            bias->data<float>();  // Bias in oneDNN is always float
+        memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
+        ReorderGates(bias_data, 1);
+      } else {
+        // oneDNN always needs bias memory; if PaddlePaddle provides none, the
+        // freshly allocated oneDNN buffer is zero-filled instead.
+        memset(bias_data, 0, sizeof(float) * this->G * this->OC);
+      }
+
+      this->dev_ctx_.SetBlob(bias_key, memory_p);
+    }
+    return memory_p;
+  }
+
+  // Returns the cached oneDNN peephole-weights memory, building it on first
+  // use. The 3 * OC peephole floats are read from `bias` starting at offset
+  // 4 * OC, i.e. after the G * OC values AcquireBiasMemory copies as biases.
+  std::shared_ptr<dnnl::memory> AcquirePeepholeWeights(
+      const phi::DenseTensor* bias) {
+    const std::string peepholes_key = this->memory_key_ + "@peepholes_weights";
+    auto memory_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(peepholes_key));
+
+    if (!memory_p) {
+      // The constructor requests the peephole descriptor in plain ldgo layout,
+      // so values are copied straight in; no staging memory or reorder needed.
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->weights_peephole_desc(), this->engine_);
+      auto* peephole_weights_data =
+          reinterpret_cast<float*>(memory_p->get_data_handle());
+
+      const float* user_bias_data = bias->data<float>();
+      memcpy(peephole_weights_data,
+             user_bias_data + 4 * this->OC,
+             sizeof(float) * 3 * this->OC);
+
+      this->dev_ctx_.SetBlob(peepholes_key, memory_p);
+    }
+    return memory_p;
+  }
+
+  // Initial cell state as oneDNN memory, cached; zero-filled when c0 is null.
+  std::shared_ptr<dnnl::memory> AcquireC0Memory(const phi::DenseTensor* c0) {
+    const std::string c0_key = this->memory_key_ + "@c0";
+    auto memory_p =
+        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(c0_key));
+
+    if (!memory_p) {
+      auto user_c0_memory = dnnl::memory();
+      if (c0) {
+        user_c0_memory =
+            dnnl::memory({{1, 1, this->N, this->OC},
+                          OneDNNGetDataType<float>(),
+                          OneDNNMemoryFormat::ldnc},
+                         this->engine_,
+                         phi::funcs::to_void_cast(c0->data<float>()));
+      } else {
+        user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC},
+                                       OneDNNGetDataType<float>(),
+                                       OneDNNMemoryFormat::ldnc},
+                                      this->engine_);
+        memset(user_c0_memory.get_data_handle(),
+               0,
+               sizeof(float) * this->N * this->OC);
+      }
+      memory_p = std::make_shared<dnnl::memory>(
+          this->fwd_pd_->src_iter_c_desc(), this->engine_);
+      // Convert from plain ldnc into the layout src_iter_c_desc() describes.
+      auto& astream = OneDNNContext::tls().get_stream();
+      dnnl::reorder(user_c0_memory, *memory_p)
+          .execute(astream, user_c0_memory, *memory_p);
+      this->dev_ctx_.SetBlob(c0_key, memory_p);
+    }
+    return memory_p;
+  }
+};
+
+// Runs oneDNN LSTM forward inference over the LoD input `x_in`.
+// Only `hidden` is written here; use_seq, force_fp32_output and every other
+// output tensor are accepted but not referenced in this function body.
+template <typename T, typename Context, typename Tout = T>
+void RunKernel(const Context& dev_ctx,
+               const DenseTensor& x_in,
+               const DenseTensor& weight_x_in,
+               const DenseTensor& weight_h_in,
+               const DenseTensor& bias_in,
+               const paddle::optional<DenseTensor>& h0_in,
+               const paddle::optional<DenseTensor>& c0_in,
+               bool use_peepholes,
+               bool is_reverse,
+               bool use_seq,
+               const std::string& gate_activation,
+               const std::string& cell_activation,
+               const std::string& candidate_activation,
+               float scale_data,
+               float shift_data,
+               const std::vector<float>& scale_weights,
+               bool force_fp32_output,
+               DenseTensor* hidden,
+               DenseTensor* cell,
+               DenseTensor* xx,
+               DenseTensor* batched_input,
+               DenseTensor* batched_hidden,
+               DenseTensor* batched_cell,
+               DenseTensor* reordered_h0,
+               DenseTensor* reordered_c0,
+               DenseTensor* checked_cell) {
+  const auto& onednn_engine = dev_ctx.GetEngine();
+  // Get Tensors
+  const auto* input = &x_in;
+  const auto* h0 = h0_in.get_ptr();
+  const auto* c0 = c0_in.get_ptr();
+  const auto* weight_x = &weight_x_in;
+  const auto* weight_h = &weight_h_in;
+  const auto* bias = &bias_in;
+
+  auto x_dims = input->dims();
+  auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
+                        ? common::flatten_to_2d(x_dims, 1)
+                        : x_dims;
+  // Get tensor dimensions
+  const auto x_mat_dims_vec = common::vectorize(x_mat_dims);
+  const auto weight_h_dims = common::vectorize(weight_h->dims());
+  const auto& input_lod = input->lod()[0];  // assumes LoD level 0 exists
+  // Calculate RNN dimensions
+  const int64_t N = input_lod.size() - 1;  // Number of sentences (batches)
+  const int64_t Ti =  // Max length of the sentence in a batch
+      [&input_lod]() {
+        size_t res = 0;
+        for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
+          res = std::max(res, input_lod[i + 1] - input_lod[i]);
+        }
+        return res;
+      }();
+  const int64_t IC = x_mat_dims_vec[1];  // Input channels
+  const int64_t OC = weight_h_dims[0];   // Output channels
+
+  LSTMMKLDNNHandler<T, Tout> handler(dev_ctx,
+                                     onednn_engine,
+                                     dev_ctx.GetPlace(),
+                                     input,
+                                     weight_h,
+                                     h0,
+                                     c0,
+                                     is_reverse,
+                                     N,
+                                     Ti,
+                                     IC,
+                                     OC,
+                                     "x_weight_h",
+                                     scale_data,
+                                     shift_data,
+                                     scale_weights,
+                                     use_peepholes,
+                                     gate_activation,
+                                     cell_activation,
+                                     candidate_activation);
+
+  auto input_memory_p =
+      handler.AcquireInputMemoryWithReorder(input, is_reverse);
+  auto c0_memory_p = handler.AcquireC0Memory(c0);
+
+  std::shared_ptr<dnnl::memory> h0_memory_p, weight_h_memory_p,
+      weight_x_memory_p;
+  // Dispatch on weight_h dtype; anything but FP32/BF16 takes the int8 path.
+  if (weight_h->dtype() == phi::DataType::FLOAT32) {
+    h0_memory_p = handler.template AcquireH0Memory<float>(h0);
+    weight_x_memory_p = handler.template AcquireWeightXMemory<float>(weight_x);
+    weight_h_memory_p = handler.template AcquireWeightHMemory<float>(weight_h);
+  } else if (weight_h->dtype() == phi::DataType::BFLOAT16) {
+    h0_memory_p = handler.template AcquireH0Memory<phi::dtype::bfloat16>(h0);
+    weight_x_memory_p =
+        handler.template AcquireWeightXMemory<phi::dtype::bfloat16>(weight_x);
+    weight_h_memory_p =
+        handler.template AcquireWeightHMemory<phi::dtype::bfloat16>(weight_h);
+  } else {
+    h0_memory_p = handler.template AcquireH0Memory<uint8_t>(h0);
+    weight_x_memory_p = handler.template AcquireWeightXMemory<int8_t>(weight_x);
+    weight_h_memory_p = handler.template AcquireWeightHMemory<int8_t>(weight_h);
+  }
+
+  auto bias_memory_p = handler.AcquireBiasMemory(bias);
+  auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
+
+  std::unordered_map<int, dnnl::memory> lstm_args = {
+      {DNNL_ARG_SRC_LAYER, *input_memory_p},
+      {DNNL_ARG_SRC_ITER, *h0_memory_p},
+      {DNNL_ARG_SRC_ITER_C, *c0_memory_p},
+      {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
+      {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
+      {DNNL_ARG_BIAS, *bias_memory_p},
+      {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
+
+  if (use_peepholes) {
+    auto peephole_weight_p = handler.AcquirePeepholeWeights(bias);
+    std::pair<int, dnnl::memory> peepholes_weights(DNNL_ARG_WEIGHTS_PEEPHOLE,
+                                                   *peephole_weight_p);
+    lstm_args.insert(peepholes_weights);
+  }
+
+  auto lstm_forward_p = handler.AcquireForwardPrimitive();
+
+  auto& astream = OneDNNContext::tls().get_stream();
+  lstm_forward_p->execute(astream, lstm_args);
+  astream.wait();
+
+  auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
+  auto* hidden_data =
+      phi::funcs::to_void_cast(dev_ctx.template Alloc<Tout>(hidden));
+  if (handler.is_NTC()) {
+    handler.reorderRNNdata(hidden_onednn_data,
+                           hidden_data,
+                           input_lod,
+                           is_reverse,
+                           RNNReorderType::NTC_PP);
+  } else {
+    handler.reorderRNNdata(hidden_onednn_data,
+                           hidden_data,
+                           input_lod,
+                           is_reverse,
+                           RNNReorderType::TNC_PP);
+  }
+}
+
+// oneDNN fusion_lstm entry point: selects the output type used by RunKernel.
+// BF16 inputs ignore force_fp32_output and always keep T as the output type.
+template <typename T, typename Context>
+void FusionLSTMMKLDNNKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& weight_x,
+                            const DenseTensor& weight_h,
+                            const DenseTensor& bias,
+                            const paddle::optional<DenseTensor>& h0,
+                            const paddle::optional<DenseTensor>& c0,
+                            bool use_peepholes,
+                            bool is_reverse,
+                            bool use_seq,
+                            const std::string& gate_activation,
+                            const std::string& cell_activation,
+                            const std::string& candidate_activation,
+                            float scale_data,
+                            float shift_data,
+                            const std::vector<float>& scale_weights,
+                            bool force_fp32_output,
+                            DenseTensor* hidden,
+                            DenseTensor* cell,
+                            DenseTensor* xx,
+                            DenseTensor* batched_input,
+                            DenseTensor* batched_hidden,
+                            DenseTensor* batched_cell,
+                            DenseTensor* reordered_h0,
+                            DenseTensor* reordered_c0,
+                            DenseTensor* checked_cell) {
+  const bool input_is_bf16 = std::is_same<T, phi::dtype::bfloat16>::value;
+  if (input_is_bf16 || !force_fp32_output) {  // NOLINT
+    RunKernel<T, Context, T>(dev_ctx,
+                             x,
+                             weight_x,
+                             weight_h,
+                             bias,
+                             h0,
+                             c0,
+                             use_peepholes,
+                             is_reverse,
+                             use_seq,
+                             gate_activation,
+                             cell_activation,
+                             candidate_activation,
+                             scale_data,
+                             shift_data,
+                             scale_weights,
+                             force_fp32_output,
+                             hidden,
+                             cell,
+                             xx,
+                             batched_input,
+                             batched_hidden,
+                             batched_cell,
+                             reordered_h0,
+                             reordered_c0,
+                             checked_cell);
+  } else {
+    RunKernel<T, Context, float>(dev_ctx,
+                                 x,
+                                 weight_x,
+                                 weight_h,
+                                 bias,
+                                 h0,
+                                 c0,
+                                 use_peepholes,
+                                 is_reverse,
+                                 use_seq,
+                                 gate_activation,
+                                 cell_activation,
+                                 candidate_activation,
+                                 scale_data,
+                                 shift_data,
+                                 scale_weights,
+                                 force_fp32_output,
+                                 hidden,
+                                 cell,
+                                 xx,
+                                 batched_input,
+                                 batched_hidden,
+                                 batched_cell,
+                                 reordered_h0,
+                                 reordered_c0,
+                                 checked_cell);
+  }
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(fusion_lstm,
+                   OneDNN,
+                   ONEDNN,
+                   phi::fusion::FusionLSTMMKLDNNKernel,
+                   float,
+                   uint8_t,  // presumably the quantized (INT8) input path
+                   phi::dtype::bfloat16) {}  // empty body: no extra setup
diff --git a/paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h b/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h
similarity index 87%
rename from paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h
rename to paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h
index c04dd0cebeec0..d429f0b3944bb 100644
--- a/paddle/fluid/operators/fused/onednn/fusion_rnn_onednn.h
+++ b/paddle/phi/kernels/fusion/onednn/fusion_rnn_onednn.h
@@ -1,24 +1,24 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #pragma once
 
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
+namespace fusion {
 
 using phi::funcs::CreateKey;
 using phi::funcs::OneDNNGetDataType;
@@ -28,8 +28,7 @@ using OneDNNMemoryFormat = dnnl::memory::format_tag;
 template <typename T, typename T_alg, typename T_out = T>
 class RNNMKLDNNHandler : public phi::funcs::OneDNNHandlerT<T, T_alg> {
  public:
-  RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                   const phi::OneDNNContext& dev_ctx,
+  RNNMKLDNNHandler(const phi::OneDNNContext& dev_ctx,
                    const dnnl::engine onednn_engine UNUSED,
                    phi::Place cpu_place,
                    const phi::DenseTensor* input UNUSED,
@@ -41,7 +40,10 @@ class RNNMKLDNNHandler : public phi::funcs::OneDNNHandlerT<T, T_alg> {
                    const int64_t IC,
                    const int64_t OC,
                    const int64_t G,
-                   const std::string& unique_name)
+                   const std::string& unique_name,
+                   float scale_data,
+                   float shift_data,
+                   std::vector<float> scale_weights)
       : phi::funcs::OneDNNHandlerT<T, T_alg>(
             dev_ctx,
             dev_ctx.GetEngine(),
@@ -62,9 +64,6 @@ class RNNMKLDNNHandler : public phi::funcs::OneDNNHandlerT<T, T_alg> {
 
     if (is_INT8) {
       // Int8 attributes
-      const float scale_data = ctx.Attr<float>("Scale_data");
-      const float shift_data = ctx.Attr<float>("Shift_data");
-      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
 
       const int weights_scale_mask =
           0 +
@@ -237,5 +236,5 @@ class RNNMKLDNNHandler : public phi::funcs::OneDNNHandlerT<T, T_alg> {
   std::string memory_key_;
   dnnl::primitive_attr attr_;
 };
-}  // namespace operators
-}  // namespace paddle
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc
new file mode 100644
index 0000000000000..42c58f60bb654
--- /dev/null
+++ b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc
@@ -0,0 +1,610 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <paddle/phi/backends/xpu/xpu_context.h>
+#include "glog/logging.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/flash_attn_kernel.h"
+#include "xpu/xdnn.h"
+
+namespace phi {
+namespace fusion {
+
+// Runs reduce_max over seq_lens_tensor on the device; returns the host copy.
+template <typename Context>
+int GetMaxLen(const Context& dev_ctx,
+              const phi::DenseTensor& seq_lens_tensor,
+              phi::DenseTensor* max_len_tensor,
+              const int batch_size) {
+  int* device_max = max_len_tensor->data<int>();  // receives the reduction
+  int host_max = 0;
+  int r = baidu::xpu::api::reduce_max<int>(dev_ctx.x_context(),
+                                           seq_lens_tensor.data<int>(),
+                                           device_max,
+                                           {batch_size},
+                                           {0});
+  PD_CHECK(r == 0, "baidu::xpu::api::reduce_max failed.");
+  xpu_wait(dev_ctx.x_context()->xpu_stream);  // wait before the host copy
+  r = xpu_memcpy(
+      &host_max, device_max, sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  PD_CHECK(r == 0, "xpu_memcpy failed.");
+  return host_max;
+}
+
+template <typename T, typename Context>
+void qkv_split_rope_kernel(
+    const Context& xpu_ctx,
+    const DenseTensor& qkv_input,
+    const DenseTensor& rotary_emb,
+    const DenseTensor& seq_lens,
+    const baidu::xpu::api::VectorParam<int32_t>& lods,
+    const baidu::xpu::api::VectorParam<int32_t>& pos_emb_offset,
+    int bsz,
+    int max_seq_len,
+    int token_num,
+    int num_head,
+    int dim_head,
+    DenseTensor* q_out,
+    DenseTensor* k_out,
+    DenseTensor* v_out) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  xpu::ctx_guard RAII_GUARD(xpu_ctx.x_context());
+  XPUType* q_ptr = reinterpret_cast<XPUType*>(q_out->data<T>());
+  XPUType* k_ptr = reinterpret_cast<XPUType*>(k_out->data<T>());
+  XPUType* v_ptr = reinterpret_cast<XPUType*>(v_out->data<T>());
+  int r = xpu::split<XPUType>(
+      xpu_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(qkv_input.data<T>()),
+      {q_ptr, k_ptr, v_ptr},  // fused qkv is split into these three buffers
+      {token_num, 3, num_head * dim_head},
+      {1, 1, 1},
+      1);  // presumably the split axis: the size-3 q/k/v dimension
+  const_cast<DenseTensor*>(&qkv_input)->clear();  // NOTE: clears caller's qkv
+  PD_CHECK(r == 0, "baidu::xpu::api::split failed.");
+  r = xpu::vsl_rotary_neox_embedding<XPUType, float, int32_t>(
+      xpu_ctx.x_context(),
+      q_ptr,
+      k_ptr,
+      rotary_emb.data<float>(),
+      q_ptr,  // q/k passed twice: presumably in-place input and output
+      k_ptr,
+      lods,
+      1,
+      max_seq_len,
+      num_head,
+      dim_head,
+      "BLHD",
+      pos_emb_offset,
+      "NORMAL",
+      -1);
+  PD_CHECK(r == 0, "baidu::xpu::api::vsl_rotary_neox_embedding failed.");
+}
+
+// Block (paged KV-cache) multi-head attention for XPU. Runs an encoder phase
+// and/or a decoder phase, selected by the max encoder/decoder lengths.
+template <typename T, typename Context>
+void BlockMultiheadAttentionXPUKernel(
+    const Context& dev_ctx,
+    const DenseTensor& qkv,
+    const DenseTensor& key_cache,
+    const DenseTensor& value_cache,
+    const DenseTensor& seq_lens_encoder,
+    const DenseTensor& seq_lens_decoder,
+    const DenseTensor& seq_lens_this_time,
+    const DenseTensor& padding_offsets,
+    const DenseTensor& cum_offsets,
+    const DenseTensor& cu_seqlens_q,
+    const DenseTensor& cu_seqlens_k,
+    const DenseTensor& block_tables,
+    const DenseTensor& cache_k_per_batch_maxs,
+    const DenseTensor& cache_v_per_batch_maxs,
+    const paddle::optional<DenseTensor>& pre_key_cache,
+    const paddle::optional<DenseTensor>& pre_value_cache,
+    const paddle::optional<DenseTensor>& rope_emb,
+    const paddle::optional<DenseTensor>& mask,
+    const paddle::optional<DenseTensor>& tgt_mask,
+    const paddle::optional<DenseTensor>& cache_k_quant_scales,
+    const paddle::optional<DenseTensor>& cache_v_quant_scales,
+    const paddle::optional<DenseTensor>& cache_k_dequant_scales,
+    const paddle::optional<DenseTensor>& cache_v_dequant_scales,
+    const paddle::optional<DenseTensor>& qkv_out_scale,
+    const paddle::optional<DenseTensor>& qkv_bias,
+    const paddle::optional<DenseTensor>& out_shift,
+    const paddle::optional<DenseTensor>& out_smooth,
+    const paddle::optional<DenseTensor>& max_enc_len_this_time,
+    const paddle::optional<DenseTensor>& max_dec_len_this_time,
+    int max_seq_len,
+    int block_size,
+    bool use_neox_style,
+    const bool dynamic_cachekv_quant,
+    const int quant_round_type,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const float out_scale,
+    const std::string& compute_dtype,
+    DenseTensor* fmha_out,
+    DenseTensor* qkv_out,
+    DenseTensor* key_cache_out,
+    DenseTensor* value_cache_out) {
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  auto xpu_context = dev_ctx.x_context();
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
+  phi::DenseTensor qkv_buf;
+  phi::DenseTensor fmha_buf;
+  VLOG(3) << "fmha_out " << fmha_out->dims();
+  if (out_scale <= 0) {
+    dev_ctx.template Alloc<T>(fmha_out);
+    fmha_buf = *fmha_out;
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented("Not supports out_scale > 0."));
+  }
+  int r = xpu::constant<XPUType>(xpu_context,
+                                 reinterpret_cast<XPUType*>(fmha_buf.data<T>()),
+                                 fmha_buf.numel(),
+                                 0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  const auto& input_dims = qkv.dims();
+  const auto& key_cache_dims = key_cache.dims();
+  const int token_num = input_dims[0];
+  const int num_head = key_cache_dims[1];
+  const int dim_head = key_cache_dims[3];
+  const int bsz = cum_offsets.dims()[0];
+  const int max_block_per_seq = block_tables.dims()[1];
+  VLOG(3) << "bsz: " << bsz << " token_num: " << token_num
+          << " num_head: " << num_head << " dim_head: " << dim_head
+          << " max_block_per_seq: " << max_block_per_seq;
+  VLOG(3) << "fmha_out_dims: " << fmha_out->dims();
+  const bool causal = !mask;  // causal attention only when no mask is given
+  bool use_pre_cache = false;
+  int pre_cache_length = 0;
+  if (pre_key_cache) {
+    PADDLE_THROW(phi::errors::Unimplemented("Not supports pre_key_cache now."));
+  }
+  VLOG(3) << "token_num: " << token_num
+          << " pre_cache_length: " << pre_cache_length;
+  // Longest decoder length: device reduce unless a CPU tensor was supplied.
+  int max_dec_len_this_time_data(0);
+  if (!max_dec_len_this_time) {
+    phi::DenseTensor max_dec_len_tensor;
+    max_dec_len_tensor.Resize({{1}});
+    dev_ctx.template Alloc<int>(&max_dec_len_tensor,
+                                max_dec_len_tensor.numel() * sizeof(int));
+    max_dec_len_this_time_data =
+        GetMaxLen(dev_ctx, seq_lens_decoder, &max_dec_len_tensor, bsz);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        max_dec_len_this_time.get().place().GetType(),
+        phi::AllocationType::CPU,
+        errors::InvalidArgument(
+            "The place of input max_dec_len_this_time must be CPU, but got %s.",
+            max_dec_len_this_time.get().place()));
+    max_dec_len_this_time_data = *max_dec_len_this_time.get().data<int>();
+  }
+  int max_enc_len_this_time_data(0);
+  if (!max_enc_len_this_time) {
+    phi::DenseTensor max_enc_len_tensor;
+    max_enc_len_tensor.Resize({{1}});
+    dev_ctx.template Alloc<int>(&max_enc_len_tensor,
+                                max_enc_len_tensor.numel() * sizeof(int));
+    max_enc_len_this_time_data =
+        GetMaxLen(dev_ctx, seq_lens_encoder, &max_enc_len_tensor, bsz);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        max_enc_len_this_time.get().place().GetType(),
+        phi::AllocationType::CPU,
+        errors::InvalidArgument(
+            "The place of input max_enc_len_this_time must be CPU, but got %s.",
+            max_enc_len_this_time.get().place()));
+    max_enc_len_this_time_data = *max_enc_len_this_time.get().data<int>();
+  }
+
+  const int MAXPTR_N = xpu_context->max_ptr_size();
+  VLOG(3) << "max_len end";
+  phi::DenseTensor unpadding_q, unpadding_k, unpadding_v;
+  phi::DenseTensor softmax_out, softmax_lse, seed_offset;
+  phi::DenseTensor q_trans, k_trans, v_trans, qktv_out;
+  if (!use_pre_cache) {
+    unpadding_q.Resize({{token_num, num_head, dim_head}});
+    unpadding_k.Resize({{token_num, num_head, dim_head}});
+    unpadding_v.Resize({{token_num, num_head, dim_head}});
+
+    dev_ctx.template Alloc<T>(&unpadding_q, unpadding_q.numel() * sizeof(T));
+    dev_ctx.template Alloc<T>(&unpadding_k, unpadding_k.numel() * sizeof(T));
+    dev_ctx.template Alloc<T>(&unpadding_v, unpadding_v.numel() * sizeof(T));
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented("Not supports pre_key_cache now."));
+  }
+  VLOG(3) << "encoder";
+  VLOG(3) << "max_enc_len_this_time_data: " << max_enc_len_this_time_data;
+  if (qkv_out_scale) {
+    PADDLE_THROW(phi::errors::Unimplemented("Not supports qkv_out_scale now."));
+  } else {
+    VLOG(1) << "qkv_out_scale is none";
+    qkv_buf = qkv;
+  }
+  if (qkv_bias) {
+    PADDLE_THROW(phi::errors::Unimplemented("Not supports qkv_bias now."));
+  }
+  std::vector<int> lods_cpu(bsz + 1, 0);
+  xpu_wait(xpu_context->xpu_stream);
+  r = xpu_memcpy(lods_cpu.data() + 1,
+                 seq_lens_this_time.data<int>(),
+                 sizeof(int32_t) * bsz,
+                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  PD_CHECK(r == 0, "xpu_memcpy failed.");
+  for (int i = 1; i < bsz + 1; i++) {
+    lods_cpu[i] += lods_cpu[i - 1];
+  }
+  baidu::xpu::api::VectorParam<int32_t> lods =
+      baidu::xpu::api::VectorParam<int32_t>{lods_cpu.data(), bsz + 1, nullptr}
+          .to_xpu(RAII_GUARD);
+  float* p_batch_max_ptrs = RAII_GUARD.alloc_l3_or_gm<float>(bsz);
+
+  if (!rope_emb || !use_neox_style) {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "only supports use_neox_style rope_emb now."));
+  }
+  if (max_enc_len_this_time_data > 0) {
+    // Encoder phase: split qkv + rotary embedding, flash attention, cache fill.
+    xpu::VectorParam<int32_t> pos_emb_offset =
+        xpu::VectorParam<int32_t>{nullptr, 0, nullptr};
+    qkv_split_rope_kernel<T, Context>(dev_ctx,
+                                      qkv,
+                                      rope_emb.get(),
+                                      seq_lens_encoder,
+                                      lods,
+                                      pos_emb_offset,
+                                      bsz,
+                                      rope_emb.get().dims()[2],
+                                      token_num,
+                                      num_head,
+                                      dim_head,
+                                      &unpadding_q,
+                                      &unpadding_k,
+                                      &unpadding_v);
+
+    VLOG(3) << "rope end";
+    VLOG(3) << "causual: " << causal;
+    if (!use_pre_cache) {
+      phi::FlashAttnUnpaddedKernel<T>(dev_ctx,
+                                      unpadding_q,
+                                      unpadding_k,
+                                      unpadding_v,
+                                      cu_seqlens_q,
+                                      cu_seqlens_k,
+                                      paddle::none /*fixed_seed_offset*/,
+                                      causal ? paddle::none : mask,
+                                      max_enc_len_this_time_data,
+                                      max_enc_len_this_time_data,
+                                      1.0f / sqrt(static_cast<float>(dim_head)),
+                                      0.0,
+                                      causal,
+                                      false,
+                                      true /* is_test*/,
+                                      "" /*rng_name*/,
+                                      &fmha_buf,
+                                      &softmax_out,
+                                      &softmax_lse,
+                                      &seed_offset);
+    } else {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("Not supports use_pre_cache now."));
+    }
+    VLOG(3) << "flash end";
+    if (cache_k_quant_scales && dynamic_cachekv_quant) {
+      PADDLE_THROW(phi::errors::Unimplemented("Not supports quant now."));
+    } else {
+      std::vector<int32_t> start_token_ctx(bsz, 0);
+      xpu::VectorParam<int32_t> start_token_ctx_VP =
+          xpu::VectorParam<int32_t>{
+              start_token_ctx.data(),
+              static_cast<int64_t>(start_token_ctx.size()),
+              nullptr}
+              .to_xpu(RAII_GUARD);
+
+      std::vector<int32_t> ordered_index_ctx(bsz, 0);
+      std::iota(ordered_index_ctx.begin(), ordered_index_ctx.end(), 0);
+      xpu::VectorParam<int32_t> ordered_index_ctx_VP =
+          xpu::VectorParam<int32_t>{
+              ordered_index_ctx.data(), static_cast<int64_t>(bsz), nullptr}
+              .to_xpu(RAII_GUARD);
+      int ret = xpu::reshape_cached_kv<XPUType, XPUType, int32_t>(
+          xpu_context,
+          reinterpret_cast<const XPUType*>(unpadding_k.data<T>()),
+          reinterpret_cast<XPUType*>(const_cast<T*>(key_cache.data<T>())),
+          block_tables.data<int>(),
+          lods,
+          start_token_ctx_VP,
+          ordered_index_ctx_VP,
+          bsz,
+          num_head,
+          dim_head,
+          bsz,
+          block_size,
+          max_block_per_seq,
+          "BLHD",
+          "HLD");
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+      ret = xpu::batch_findmax<XPUType>(
+          xpu_context,
+          reinterpret_cast<XPUType*>(const_cast<T*>(key_cache.data<T>())),
+          token_num,
+          num_head * dim_head,
+          bsz,
+          lods.xpu,
+          p_batch_max_ptrs);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+      ret = xpu::copy2d<float>(
+          xpu_context,
+          p_batch_max_ptrs,
+          const_cast<float*>(cache_k_per_batch_maxs.data<float>()),
+          bsz,
+          1,
+          MAXPTR_N,
+          1);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+      ret = xpu::reshape_cached_kv<XPUType, XPUType, int32_t>(
+          xpu_context,
+          reinterpret_cast<const XPUType*>(unpadding_v.data<T>()),
+          reinterpret_cast<XPUType*>(const_cast<T*>(value_cache.data<T>())),
+          block_tables.data<int>(),
+          lods,
+          start_token_ctx_VP,
+          ordered_index_ctx_VP,
+          bsz,
+          num_head,
+          dim_head,
+          bsz,
+          block_size,
+          max_block_per_seq,
+          "BLHD",
+          "HLD");
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+      ret = xpu::batch_findmax<XPUType>(
+          xpu_context,
+          reinterpret_cast<XPUType*>(const_cast<T*>(value_cache.data<T>())),
+          token_num,
+          num_head * dim_head,
+          bsz,
+          lods.xpu,
+          p_batch_max_ptrs);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+      ret = xpu::copy2d<float>(
+          xpu_context,
+          p_batch_max_ptrs,
+          const_cast<float*>(cache_v_per_batch_maxs.data<float>()),
+          bsz,
+          1,
+          MAXPTR_N,
+          1);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    }
+    VLOG(3) << "cache end";
+  }
+  VLOG(3) << "encoder done";
+  VLOG(3) << "max_dec_len_this_time_data: " << max_dec_len_this_time_data;
+  // Decoder phase: entered only when the longest decoder length is non-zero.
+  if (max_dec_len_this_time_data > 0) {
+    int cachekv_quant_mode = 0;
+    if (cache_k_quant_scales || cachekv_quant_mode) {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "Not supports cache_k_quant_scales or cachekv_quant_mode now."));
+    }
+    std::vector<int> lods_decoder_cpu(bsz + 1, 0);
+    xpu_wait(xpu_context->xpu_stream);
+    r = xpu_memcpy(lods_decoder_cpu.data() + 1,
+                   seq_lens_decoder.data<int>(),
+                   sizeof(int32_t) * bsz,
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    PD_CHECK(r == 0, "xpu_memcpy failed.");
+    for (int i = 1; i < bsz + 1; i++) {
+      lods_decoder_cpu[i] += lods_decoder_cpu[i - 1];
+    }
+    std::vector<int32_t> kv_seq_lod_dec(bsz + 1, 0);
+    std::iota(kv_seq_lod_dec.begin(), kv_seq_lod_dec.end(), 0);
+    xpu::VectorParam<int32_t> kv_seq_lod_dec_VP =
+        xpu::VectorParam<int32_t>{kv_seq_lod_dec.data(),
+                                  static_cast<int64_t>(kv_seq_lod_dec.size()),
+                                  nullptr}
+            .to_xpu(RAII_GUARD);
+    std::vector<int32_t> start_token_ctx(bsz, 0);
+    for (int i = 0; i < bsz; i++) {
+      start_token_ctx[i] = lods_decoder_cpu[i + 1] - lods_decoder_cpu[i];
+    }
+    xpu::VectorParam<int32_t> start_token_ctx_VP =
+        xpu::VectorParam<int32_t>{start_token_ctx.data(),
+                                  static_cast<int64_t>(start_token_ctx.size()),
+                                  nullptr}
+            .to_xpu(RAII_GUARD);
+    qkv_split_rope_kernel<T, Context>(dev_ctx,
+                                      qkv,
+                                      rope_emb.get(),
+                                      seq_lens_encoder,
+                                      lods,
+                                      start_token_ctx_VP,
+                                      bsz,
+                                      rope_emb.get().dims()[2],
+                                      token_num,
+                                      num_head,
+                                      dim_head,
+                                      &unpadding_q,
+                                      &unpadding_k,
+                                      &unpadding_v);
+    // NOTE(review): the helper clear()s qkv; confirm both phases never co-run.
+    std::vector<int32_t> ordered_index_ctx(bsz, 0);
+    std::iota(ordered_index_ctx.begin(), ordered_index_ctx.end(), 0);
+    xpu::VectorParam<int32_t> ordered_index_ctx_VP =
+        xpu::VectorParam<int32_t>{
+            ordered_index_ctx.data(), static_cast<int64_t>(bsz), nullptr}
+            .to_xpu(RAII_GUARD);
+
+    float* p_batch_max_ptrs_fill =
+        RAII_GUARD.alloc_l3_or_gm<float>(bsz * MAXPTR_N);
+    int ret = xpu::constant<float>(
+        xpu_context, p_batch_max_ptrs_fill, bsz * MAXPTR_N, 0.0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
+    float* p_cache_k_max_data = RAII_GUARD.alloc_l3_or_gm<float>(MAXPTR_N);
+    float* p_cache_v_max_data = RAII_GUARD.alloc_l3_or_gm<float>(MAXPTR_N);
+    ret = xpu::reshape_cached_kv<XPUType, XPUType, int32_t>(
+        xpu_context,
+        reinterpret_cast<const XPUType*>(unpadding_k.data<T>()),
+        reinterpret_cast<XPUType*>(const_cast<T*>(key_cache.data<T>())),
+        block_tables.data<int>(),
+        kv_seq_lod_dec_VP,
+        start_token_ctx_VP,
+        ordered_index_ctx_VP,
+        bsz,
+        num_head,
+        dim_head,
+        bsz,
+        block_size,
+        max_block_per_seq,
+        "BLHD",
+        "HLD");
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+    ret = xpu::batch_findmax<XPUType>(
+        xpu_context,
+        reinterpret_cast<XPUType*>(unpadding_k.data<T>()),
+        bsz,
+        num_head * dim_head,
+        p_batch_max_ptrs);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+    unpadding_k.clear();
+    ret = xpu::copy2d<float>(xpu_context,
+                             p_batch_max_ptrs,
+                             p_batch_max_ptrs_fill,
+                             bsz,
+                             1,
+                             MAXPTR_N,
+                             1);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    ret = xpu::max<float>(
+        xpu_context,
+        cache_k_per_batch_maxs.data<float>(),
+        p_batch_max_ptrs_fill,
+        const_cast<float*>(cache_k_per_batch_maxs.data<float>()),
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "max");
+    ret = xpu::findmax<float>(
+        xpu_context,
+        const_cast<float*>(cache_k_per_batch_maxs.data<float>()),
+        p_cache_k_max_data,
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+    ret = xpu::reshape_cached_kv<XPUType, XPUType, int32_t>(
+        xpu_context,
+        reinterpret_cast<const XPUType*>(unpadding_v.data<T>()),
+        reinterpret_cast<XPUType*>(const_cast<T*>(value_cache.data<T>())),
+        block_tables.data<int>(),
+        kv_seq_lod_dec_VP,
+        start_token_ctx_VP,
+        ordered_index_ctx_VP,
+        bsz,
+        num_head,
+        dim_head,
+        bsz,
+        block_size,
+        max_block_per_seq,
+        "BLHD",
+        "HLD");
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+    ret = xpu::batch_findmax<XPUType>(
+        xpu_context,
+        reinterpret_cast<XPUType*>(unpadding_v.data<T>()),
+        bsz,
+        num_head * dim_head,
+        p_batch_max_ptrs);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+    unpadding_v.clear();
+    ret = xpu::copy2d<float>(xpu_context,
+                             p_batch_max_ptrs,
+                             p_batch_max_ptrs_fill,
+                             bsz,
+                             1,
+                             MAXPTR_N,
+                             1);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    ret = xpu::max<float>(
+        xpu_context,
+        cache_v_per_batch_maxs.data<float>(),
+        p_batch_max_ptrs_fill,
+        const_cast<float*>(cache_v_per_batch_maxs.data<float>()),
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "max");
+    ret = xpu::findmax<float>(
+        xpu_context,
+        const_cast<float*>(cache_v_per_batch_maxs.data<float>()),
+        p_cache_v_max_data,
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+
+    VLOG(1) << "cachekv_quant_mode " << cachekv_quant_mode;
+    std::vector<int32_t> qkvlod_dec(2 * (bsz + 1), 0);
+    for (int bs = 0; bs < bsz; bs++) {
+      qkvlod_dec[bs + 1] = bs + 1;
+      qkvlod_dec[bsz + 1 + bs + 1] = lods_decoder_cpu[bs + 1] + bs + 1;
+    }
+    auto qkvlod_dec_vp =
+        xpu::VectorParam<int32_t>{
+            qkvlod_dec.data(), static_cast<int64_t>(qkvlod_dec.size()), nullptr}
+            .to_xpu(RAII_GUARD);
+    xpu::DecodeAttnParam decoder_attn_vsl_param(
+        qkvlod_dec_vp, max_seq_len, num_head, dim_head, -1, 0, bsz, {});
+    xpu::PageAttnParam<int> page_param(
+        block_size, bsz, max_block_per_seq, ordered_index_ctx_VP, 0, "HLD");
+    float* max_q_ptr = RAII_GUARD.alloc_l3_or_gm<float>(MAXPTR_N);
+    ret = xpu::findmax<XPUType>(xpu_context,
+                                reinterpret_cast<XPUType*>(unpadding_q.data()),
+                                max_q_ptr,
+                                token_num * num_head * dim_head);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+
+    ret = xpu::qkv_paged_attention<XPUType,
+                                   XPUType,
+                                   XPUType,
+                                   XPUType,
+                                   int16_t,
+                                   float,
+                                   int>(
+        xpu_context,
+        reinterpret_cast<XPUType*>(unpadding_q.data()),
+        reinterpret_cast<XPUType*>(const_cast<T*>(key_cache.data<T>())),
+        reinterpret_cast<XPUType*>(const_cast<T*>(value_cache.data<T>())),
+        block_tables.data<int>(),  // [pagep.max_batch_size,
+                                   // pagep.max_num_blocks_per_seq]
+        reinterpret_cast<XPUType*>(fmha_buf.data<T>()),
+        max_q_ptr,
+        p_cache_k_max_data,  // shape=[6], nullptr if pagep.quant_type == 1
+        p_cache_v_max_data,  // shape=[6], nullptr if pagep.quant_type == 1
+        nullptr,
+        decoder_attn_vsl_param,  // attention-related parameters
+        page_param);             // paged-attention parameters
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "qkv_paged_attention");
+  }
+  VLOG(3) << "decoder done";
+}
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(block_multihead_attention_xpu,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::BlockMultiheadAttentionXPUKernel,
+                   phi::dtype::float16) {
+  kernel->InputAt(26).SetBackend(phi::Backend::CPU);  // max_enc_len_this_time
+  kernel->InputAt(27).SetBackend(phi::Backend::CPU);  // max_dec_len_this_time
+}
diff --git a/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
index 833caa6688787..cac0182feaa2b 100644
--- a/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
@@ -63,6 +63,11 @@ void FusedLayerNormKernel(const Context& dev_ctx,
 
   dev_ctx.template Alloc<float>(&residual_alpha_tmp);
   dev_ctx.template Alloc<T>(&residual_alpha_ptr);
+  r = baidu::xpu::api::constant(xpu_ctx->x_context(),
+                                reinterpret_cast<XPUType*>(out->data<T>()),
+                                out->numel(),
+                                static_cast<XPUType>(0.f));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
 
   r = baidu::xpu::api::constant(xpu_ctx->x_context(),
                                 residual_alpha_tmp.data<float>(),
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index 594eefe5b8de1..ecfd46852c134 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -209,9 +209,6 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, CudaSwishGradFunctor);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
                                                alpha);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
-                                               CudaThresholdedReluGradFunctor,
-                                               threshold);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
                                                CudaSoftShrinkGradFunctor,
                                                lambda);
@@ -247,7 +244,10 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
                                                  CudaHardSigmoidGradFunctor,
                                                  slope,
                                                  offset);
-
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(ThresholdedRelu,
+                                               CudaThresholdedReluGradFunctor,
+                                               threshold,
+                                               value);
 template <typename T, typename Context>
 void SiluGradKernel(const Context& dev_ctx,
                     const DenseTensor& x,
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index 1bf3d92d80620..aa874c5e0dd81 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -123,9 +123,6 @@ DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Expm1, CudaExpm1Functor)
 
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LogitCUDA, CudaLogitFunctor, eps)
-DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
-                                     CudaThresholdedReluFunctor,
-                                     threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
                                      CudaHardShrinkFunctor,
                                      threshold)
@@ -148,6 +145,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                      slope,
                                      offset)
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Selu, CudaSeluFunctor, scale, alpha)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(ThresholdedRelu,
+                                     CudaThresholdedReluFunctor,
+                                     threshold,
+                                     value)
 
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu
index 564090490f847..3a020e4359d9d 100644
--- a/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu
@@ -60,7 +60,7 @@ void FillDiagonalGradKernel(const Context& ctx,
   auto strides = funcs::CalStride(out_dims);
 
   auto wrapsize = std::min(size, out_dims[1] * out_dims[1]);
-  // The wrap mode supported only the dims equels to 2; In wrap mode, the
+  // The wrap mode supported only the dims equals to 2; In wrap mode, the
   // value will be filled in cycles
   if (wrap) {
     wrapsize = size;
diff --git a/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu
index aed2380628ceb..8a06bd33fa4f3 100644
--- a/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu
+++ b/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu
@@ -61,7 +61,7 @@ void FillDiagonalKernel(const Context& ctx,
   auto out_dims = out->dims();
   auto strides = funcs::CalStride(out_dims);
 
-  // The wrap mode supported only the dims equels to 2; In wrap mode, the
+  // The wrap mode supported only the dims equals to 2; In wrap mode, the
   // value will be filled in cycles
   if (!wrap) {
     size = std::min(size, out_dims[1] * out_dims[1]);
diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu
index 4835b643efcc7..720447ea41a0e 100644
--- a/paddle/phi/kernels/gpu/group_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu
@@ -123,6 +123,17 @@ inline __device__ void UpdateSum(const T* srcX, float* sum, float* sumSq) {
   *sumSq += src_data * src_data;
 }
 
+template <typename T, int THREADS_PER_CHANNEL>
+inline __device__ void UpdateSum(const T* srcX,
+                                 const T* srcR,
+                                 float* sum,
+                                 float* sumSq) {
+  float src_data = phi::__2float<T>(*srcX);
+  float srcy_data = phi::__2float<T>(*srcR);
+  *sum += src_data + srcy_data;
+  *sumSq += (src_data + srcy_data) * (src_data + srcy_data);
+}
+
 template <>
 inline __device__ void UpdateSum<__half, 2>(const __half* srcX,
                                             float* sum,
@@ -133,6 +144,20 @@ inline __device__ void UpdateSum<__half, 2>(const __half* srcX,
   *sumSq += f2.x * f2.x + f2.y * f2.y;
 }
 
+template <>
+inline __device__ void UpdateSum<__half, 2>(const __half* srcX,
+                                            const __half* srcR,
+                                            float* sum,
+                                            float* sumSq) {
+  __half2 h2 = *reinterpret_cast<__half2 const*>(srcX);
+  __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR);
+  float2 f2 = __half22float2(h2);
+  float2 f2_r = __half22float2(h2_r);
+  *sum += f2.x + f2_r.x + f2.y + f2_r.y;
+  *sumSq +=
+      (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y);
+}
+
 template <>
 inline __device__ void UpdateSum<phi::dtype::float16, 2>(
     const phi::dtype::float16* srcX, float* sum, float* sumSq) {
@@ -142,6 +167,21 @@ inline __device__ void UpdateSum<phi::dtype::float16, 2>(
   *sumSq += f2.x * f2.x + f2.y * f2.y;
 }
 
+template <>
+inline __device__ void UpdateSum<phi::dtype::float16, 2>(
+    const phi::dtype::float16* srcX,
+    const phi::dtype::float16* srcR,
+    float* sum,
+    float* sumSq) {
+  __half2 h2 = *reinterpret_cast<__half2 const*>(srcX);
+  __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR);
+  float2 f2 = __half22float2(h2);
+  float2 f2_r = __half22float2(h2_r);
+  *sum += f2.x + f2_r.x + f2.y + f2_r.y;
+  *sumSq +=
+      (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y);
+}
+
 #ifdef PADDLE_CUDA_BF16
 template <>
 inline __device__ void UpdateSum<phi::dtype::bfloat16, 2>(
@@ -151,6 +191,21 @@ inline __device__ void UpdateSum<phi::dtype::bfloat16, 2>(
   *sum += f2.x + f2.y;
   *sumSq += f2.x * f2.x + f2.y * f2.y;
 }
+
+template <>
+inline __device__ void UpdateSum<phi::dtype::bfloat16, 2>(
+    const phi::dtype::bfloat16* srcX,
+    const phi::dtype::bfloat16* srcR,
+    float* sum,
+    float* sumSq) {
+  __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX);
+  __nv_bfloat162 h2_r = *reinterpret_cast<__nv_bfloat162 const*>(srcR);
+  float2 f2 = phi::bfloat1622float2(h2);
+  float2 f2_r = phi::bfloat1622float2(h2_r);
+  *sum += f2.x + f2_r.x + f2.y + f2_r.y;
+  *sumSq +=
+      (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y);
+}
 #endif
 
 template <typename T, int THREADS_PER_BLOCK>
@@ -177,7 +232,13 @@ __global__ void groupNormNDHWCSumSingerChannelKernel(
     int64_t offset = static_cast<int64_t>(ni) * params.dhwc +
                      static_cast<int64_t>(dhwi) * params.c + ci;
     float src_data = *reinterpret_cast<float const*>(&params.srcX[offset]);
-    UpdateSum<T, 1>(&params.srcX[offset], &sum, &sumSq);
+    if (params.srcR != nullptr) {
+      int64_t g_offset = params.y_same_with_x ? offset : ci;
+      UpdateSum<T, 1>(
+          &params.srcX[offset], &params.srcR[g_offset], &sum, &sumSq);
+    } else {
+      UpdateSum<T, 1>(&params.srcX[offset], &sum, &sumSq);
+    }
   }
 
   smem[threadIdx.x] = make_float2(sum, sumSq);
@@ -185,7 +246,6 @@ __global__ void groupNormNDHWCSumSingerChannelKernel(
   __syncthreads();
 
   float2 sums = smem[threadIdx.x];
-
   atomicAdd(&params.redBuffer[(2 * ni + 0) * params.groups + ci],
             sums.x * params.invDHWC);
   atomicAdd(&params.redBuffer[(2 * ni + 1) * params.groups + ci], sums.y);
@@ -209,7 +269,8 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams<T> params) {
   if (ci >= params.c || threadIdx.x * THREADS_PER_CHANNEL >= params.cPerBlock) {
     return;
   }
-  // The first activation loaded by that block.
+  int32_t gj = ci / params.cPerGroup;
+  int32_t cj = ci % params.cPerGroup;
   int32_t dhwBegin = blockIdx.y * params.dhwPerBlock;
   // The last activation loaded by that block.
   int32_t dhwEnd = min(dhwBegin + params.dhwPerBlock, params.dhw);
@@ -223,13 +284,19 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams<T> params) {
     int64_t offset = static_cast<int64_t>(ni) * params.dhwc +
                      static_cast<int64_t>(dhwi) * params.c + ci;
     float src_data = *reinterpret_cast<float const*>(&params.srcX[offset]);
-    UpdateSum<T, THREADS_PER_CHANNEL>(&params.srcX[offset], &sum, &sumSq);
+    if (params.srcR != nullptr) {
+      int64_t g_offset =
+          params.y_same_with_x ? offset : gj * params.cPerGroup + cj;
+      UpdateSum<T, THREADS_PER_CHANNEL>(
+          &params.srcX[offset], &params.srcR[g_offset], &sum, &sumSq);
+    } else {
+      UpdateSum<T, THREADS_PER_CHANNEL>(&params.srcX[offset], &sum, &sumSq);
+    }
   }
 
   // The group that thread works on and the channel in the group (modulus).
   int32_t gi =
       ci / params.cPerGroup - blockIdx.x * params.cPerBlock / params.cPerGroup;
-  int32_t cj = ci % params.cPerGroup;
   int flag = (cj == 0 || threadIdx.x == 0) ? 1 : 0;
   GroupSums inp{flag, sum, sumSq};
   GroupSums out;
@@ -243,7 +310,6 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams<T> params) {
 
   __syncthreads();
 
-  int32_t gj = ci / params.cPerGroup;
   if (cj == params.cPerGroup - THREADS_PER_CHANNEL ||
       threadIdx.x * THREADS_PER_CHANNEL ==
           params.cPerBlock - THREADS_PER_CHANNEL) {
@@ -351,7 +417,15 @@ inline __device__ void GroupNormCompute(int32_t dhwBegin,
   for (int32_t dhwi = dhwBegin; dhwi < dhwEnd; ++dhwi) {
     // The src/dst offset.
     int64_t offset = (int64_t)blockIdx.z * params.dhwc + dhwi * params.c + ci;
-    const float src_data = phi::__2float<T>(params.srcX[offset]);
+    float src_data = phi::__2float<T>(params.srcX[offset]);
+    if (params.srcR != nullptr) {
+      auto gi = ci / params.cPerGroup;
+      auto gj = ci % params.cPerGroup;
+      int64_t g_offset =
+          params.y_same_with_x ? offset : gi * params.cPerGroup + gj;
+      src_data += phi::__2float<T>(params.srcR[g_offset]);
+      *reinterpret_cast<T*>(&params.eleOut[offset]) = phi::__2dst<T>(src_data);
+    }
     // Normalize the channels.
     float dst_data = (src_data - mean) * invStdDev;
     // Scale by gamma and add beta.
@@ -392,6 +466,18 @@ inline __device__ void GroupNormCompute<phi::dtype::float16, 2>(
     // Extract the two half values.
     float2 f2 = __half22float2(h2);
 
+    if (params.srcR != nullptr) {
+      auto gi = ci / params.cPerGroup;
+      auto gj = ci % params.cPerGroup;
+      int64_t g_offset =
+          params.y_same_with_x ? offset : gi * params.cPerGroup + gj;
+      __half2 r2 = *reinterpret_cast<__half2 const*>(&params.srcR[g_offset]);
+      float2 r_f2 = __half22float2(r2);
+      f2.x += r_f2.x;
+      f2.y += r_f2.y;
+      *reinterpret_cast<__half2*>(&params.eleOut[offset]) =
+          __float22half2_rn(f2);
+    }
     // Normalize the channels.
     f2.x = (f2.x - mean) * invStdDev;
     f2.y = (f2.y - mean) * invStdDev;
@@ -434,7 +520,18 @@ inline __device__ void GroupNormCompute<__half, 2>(
 
     // Extract the two half values.
     float2 f2 = __half22float2(h2);
-
+    if (params.srcR != nullptr) {
+      auto gi = ci / params.cPerGroup;
+      auto gj = ci % params.cPerGroup;
+      int64_t g_offset =
+          params.y_same_with_x ? offset : gi * params.cPerGroup + gj;
+      __half2 r2 = *reinterpret_cast<__half2 const*>(&params.srcR[g_offset]);
+      float2 r_f2 = __half22float2(r2);
+      f2.x += r_f2.x;
+      f2.y += r_f2.y;
+      *reinterpret_cast<__half2*>(&params.eleOut[offset]) =
+          __float22half2_rn(f2);
+    }
     // Normalize the channels.
     f2.x = (f2.x - mean) * invStdDev;
     f2.y = (f2.y - mean) * invStdDev;
@@ -480,6 +577,19 @@ inline __device__ void GroupNormCompute<phi::dtype::bfloat16, 2>(
     // Extract the two half values.
     float2 f2 = phi::bfloat1622float2(h2);
 
+    if (params.srcR != nullptr) {
+      auto gi = ci / params.cPerGroup;
+      auto gj = ci % params.cPerGroup;
+      int64_t g_offset =
+          params.y_same_with_x ? offset : gi * params.cPerGroup + gj;
+      __nv_bfloat162 r2 =
+          *reinterpret_cast<__nv_bfloat162 const*>(&params.srcR[g_offset]);
+      float2 r_f2 = phi::bfloat1622float2(r2);
+      f2.x += r_f2.x;
+      f2.y += r_f2.y;
+      *reinterpret_cast<__nv_bfloat162*>(&params.eleOut[offset]) =
+          phi::float22bfloat162_rn(f2);
+    }
     // Normalize the channels.
     f2.x = (f2.x - mean) * invStdDev;
     f2.y = (f2.y - mean) * invStdDev;
@@ -511,6 +621,7 @@ __global__ void groupNormNDHWCScaleKernel(
 
   // The group that thread works on and the channel in the group (modulus).
   int32_t gi = ci / params.cPerGroup;
+  int32_t gj = ci % params.cPerGroup;
 
   if (ci >= params.c || gi >= params.groups) {
     return;
@@ -597,17 +708,24 @@ template class groupNormNDHWCScale<half>;
 template <typename T, typename Context>
 void GroupNormNDHWCKernel(const Context& dev_ctx,
                           const DenseTensor& x,
+                          const paddle::optional<DenseTensor>& residual,
                           const paddle::optional<DenseTensor>& scale,
                           const paddle::optional<DenseTensor>& bias,
                           float epsilon,
                           int groups,
                           const std::string& data_layout_str,
+                          const std::string& activation,
                           DenseTensor* y,
+                          DenseTensor* residual_out,
                           DenseTensor* mean,
                           DenseTensor* var) {
+  const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
+  if (data_layout != DataLayout::kNHWC) {
+    PD_THROW("data_layout only supports NHWC and NDHWC");
+  }
   using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
   GroupNormNDHWCParams<T> params_;
-  params_.withSilu = false;
+  params_.withSilu = activation == "silu" ? true : false;
 
   const auto x_dims = x.dims();
   dev_ctx.template Alloc<T>(y);
@@ -639,6 +757,23 @@ void GroupNormNDHWCKernel(const Context& dev_ctx,
     params_.w = x_dims[3];
   }
 
+  const T* residual_data = nullptr;
+  const auto residual_ptr = residual.get_ptr();
+  T* residual_out_data = nullptr;
+  if (residual_ptr) {
+    dev_ctx.template Alloc<T>(residual_out);
+    residual_data = residual_ptr->data<T>();
+    residual_out_data = residual_out->data<T>();
+    const auto r_dims = residual_ptr->dims();
+    int32_t r_dim = 1;
+    for (size_t i = 0; i < r_dims.size(); i++) {
+      r_dim *= r_dims[i];
+    }
+    params_.y_same_with_x =
+        r_dim == params_.n * params_.c * params_.d * params_.h * params_.w
+            ? true
+            : false;
+  }
   dev_ctx.template Alloc<AccT>(mean);
   dev_ctx.template Alloc<AccT>(var);
   auto* mean_data = mean->data<AccT>();
@@ -673,7 +808,10 @@ void GroupNormNDHWCKernel(const Context& dev_ctx,
   }
   params_.srcX = reinterpret_cast<const T*>(x_data);
   params_.dst = reinterpret_cast<T*>(y_data);
-
+  if (residual_ptr) {
+    params_.srcR = reinterpret_cast<const T*>(residual_data);
+    params_.eleOut = reinterpret_cast<T*>(residual_out_data);
+  }
   params_.gamma = scale_data;
   params_.beta = bias_data;
   params_.dhw = params_.d * params_.h * params_.w;
@@ -1027,14 +1165,19 @@ void GroupNormKernel(const Context& dev_ctx,
                      DenseTensor* var) {
   using std::is_same;
   if (is_same<T, phi::dtype::float16>::value && data_layout_str == "NHWC") {
+    const paddle::optional<DenseTensor> residual(paddle::none);
+    DenseTensor residual_out;  // stack placeholder; unused without residual
     GroupNormNDHWCKernel<phi::dtype::float16, Context>(dev_ctx,
                                                        x,
+                                                       residual,
                                                        scale,
                                                        bias,
                                                        epsilon,
                                                        groups,
                                                        data_layout_str,
+                                                       "",
                                                        y,
+                                                       &residual_out,
                                                        mean,
                                                        var);
     return;
@@ -1042,14 +1185,19 @@ void GroupNormKernel(const Context& dev_ctx,
 
 #ifdef PADDLE_CUDA_BF16
   if (is_same<T, phi::dtype::bfloat16>::value && data_layout_str == "NHWC") {
+    const paddle::optional<DenseTensor> residual(paddle::none);
+    DenseTensor residual_out;  // stack placeholder; unused without residual
     GroupNormNDHWCKernel<phi::dtype::bfloat16, Context>(dev_ctx,
                                                         x,
+                                                        residual,
                                                         scale,
                                                         bias,
                                                         epsilon,
                                                         groups,
                                                         data_layout_str,
+                                                        "",
                                                         y,
+                                                        &residual_out,
                                                         mean,
                                                         var);
     return;
@@ -1076,3 +1224,13 @@ PD_REGISTER_KERNEL(group_norm,
     kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
   }
 }
+
+PD_REGISTER_KERNEL(add_group_norm_silu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GroupNormNDHWCKernel,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16) {
+  kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
+}
diff --git a/paddle/phi/kernels/gpu/inverse_grad_kernel.cu b/paddle/phi/kernels/gpu/inverse_grad_kernel.cu
index 2fdc02934fedc..15c24719adfc3 100644
--- a/paddle/phi/kernels/gpu/inverse_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/inverse_grad_kernel.cu
@@ -18,5 +18,11 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/inverse_grad_kernel_impl.h"
 
-PD_REGISTER_KERNEL(
-    inverse_grad, GPU, ALL_LAYOUT, phi::InverseGradKernel, float, double) {}
+PD_REGISTER_KERNEL(inverse_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::InverseGradKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/inverse_kernel.cu b/paddle/phi/kernels/gpu/inverse_kernel.cu
index 4c011337c6f8f..a9b4fcc763b0b 100644
--- a/paddle/phi/kernels/gpu/inverse_kernel.cu
+++ b/paddle/phi/kernels/gpu/inverse_kernel.cu
@@ -18,5 +18,11 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/inverse_kernel_impl.h"
 
-PD_REGISTER_KERNEL(
-    inverse, GPU, ALL_LAYOUT, phi::InverseKernel, float, double) {}
+PD_REGISTER_KERNEL(inverse,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::InverseKernel,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc b/paddle/phi/kernels/gpu/lstm_grad_kernel.cu
similarity index 50%
rename from paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc
rename to paddle/phi/kernels/gpu/lstm_grad_kernel.cu
index d3bf58bdec3c8..5590541dcb385 100644
--- a/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc
+++ b/paddle/phi/kernels/gpu/lstm_grad_kernel.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,19 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/core/compat/op_utils.h"
+#include "paddle/phi/kernels/impl/lstm_kernel_impl.h"
+#include "paddle/phi/kernels/lstm_kernel.h"
 
-namespace phi {
-
-KernelSignature ChannelShuffleGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("channel_shuffle_grad",
-                         {"Out@GRAD"},
-                         {"groups", "data_format"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(channel_shuffle_grad,
-                           phi::ChannelShuffleGradOpArgumentMapping);
+PD_REGISTER_KERNEL(
+    lstm_grad, GPU, ALL_LAYOUT, phi::LSTMGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/lstm_kernel.cu b/paddle/phi/kernels/gpu/lstm_kernel.cu
new file mode 100644
index 0000000000000..7bcf1f78ab604
--- /dev/null
+++ b/paddle/phi/kernels/gpu/lstm_kernel.cu
@@ -0,0 +1,18 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lstm_kernel.h"
+#include "paddle/phi/kernels/impl/lstm_kernel_impl.h"
+
+PD_REGISTER_KERNEL(lstm, GPU, ALL_LAYOUT, phi::LSTMKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc
index 2dd9e7dc6ceec..3244f28c77700 100644
--- a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc
@@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(meshgrid_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc
index 5a1c74f4193d3..9176305d94fec 100644
--- a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc
@@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(meshgrid,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu
index 450bfc07a7b46..59afcdfe9884f 100644
--- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu
@@ -27,6 +27,14 @@ PD_REGISTER_KERNEL(pool2d_grad,
                    double,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(lp_pool2d_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LPPool2dGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(pool2d_double_grad,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu
index 33abba0a51a50..b9ab97da86fe1 100644
--- a/paddle/phi/kernels/gpu/pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/pool_kernel.cu
@@ -27,6 +27,14 @@ PD_REGISTER_KERNEL(pool2d,
                    double,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(lp_pool2d,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LPPool2dKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(max_pool2d_with_index,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
index 24e79c77a50e1..4250ffb76dbe3 100644
--- a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu
@@ -73,6 +73,7 @@ void PoolGradRawGPUDNNKernel(const Context& ctx,
                                      global_pooling,
                                      adaptive,
                                      padding_algorithm,
+                                     0,
                                      dx);
     return;
   }
diff --git a/paddle/phi/kernels/group_norm_kernel.h b/paddle/phi/kernels/group_norm_kernel.h
index 3dc10df6a1109..7f4b83f065bde 100644
--- a/paddle/phi/kernels/group_norm_kernel.h
+++ b/paddle/phi/kernels/group_norm_kernel.h
@@ -67,6 +67,8 @@ struct GroupNormNDHWCParams {
   T const* srcX;
   // The input buffer. Layout NDHWC.
   T const* srcY;
+  // The optional residual buffer added to srcX; nullptr when absent.
+  T const* srcR = nullptr;
   // The gamma scaling factor.
   void const* gamma;
   // The beta term to add in GN.
@@ -87,7 +89,8 @@ struct GroupNormNDHWCParams {
   int32_t groups;
   // Do we apply the Silu activation function?
   bool withSilu;
-
+  // True when srcR has as many elements as srcX (indexed by the same offset).
+  bool y_same_with_x = false;
   // Precomputed values and parameters to control the execution of the kernels.
 
   // The number of activations per instance (d * h * w) and the number of
diff --git a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h
index 72c8bc659a632..debc4ad1b6db6 100644
--- a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h
@@ -92,17 +92,17 @@ void FFTC2RGradKernel(const Context& ctx,
 
   const int64_t double_length =
       out_grad.dims()[axes.back()] - x_grad->dims()[axes.back()];
-  const phi::DDim strides = common::stride(x_grad->dims());
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-  const thrust::device_vector<int64_t> strides_g(common::vectorize(strides));
-  const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data());
-#else
-  const int64_t* pstrides = strides.Get();
-#endif
-
-  funcs::FFTFillConjGradFunctor<C> func(
-      x_grad->data<C>(), axes.back(), pstrides, double_length);
+  int64_t stride_to_last_axis = 1;
+  auto ddim = x_grad->dims();
+  for (int i = ddim.size() - 2; i >= axes.back(); --i) {
+    stride_to_last_axis *= ddim[i + 1];
+  }
+  int64_t stride_second_to_last_axis = stride_to_last_axis * ddim[axes.back()];
+  funcs::FFTFillConjGradFunctor<C> func(x_grad->data<C>(),
+                                        axes.back(),
+                                        stride_second_to_last_axis,
+                                        stride_to_last_axis,
+                                        double_length);
   size_t limit = x_grad->numel();
   funcs::ForRange<Context> for_range(ctx, limit);
   for_range(func);
diff --git a/paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h b/paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h
new file mode 100644
index 0000000000000..bcaf21cc22df8
--- /dev/null
+++ b/paddle/phi/kernels/impl/fused_elemwise_activation_kernel_impl.h
@@ -0,0 +1,260 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/compound_functors.h"
+#include "paddle/phi/kernels/funcs/elementwise/elementwise_op_function.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/functors.h"
+#include "paddle/phi/kernels/funcs/fused_elemwise_activation_functor.h"
+
+namespace phi {
+
+// Forward kernel of the fused elementwise + activation op.
+// Validates the outputs and delegates the computation to
+// phi::funcs::RunFunctors. Slot 0 of the output list is always Out; slot 1
+// is intermediate_out when save_intermediate_out is set, nullptr otherwise.
+template <typename T, typename Context>
+void FusedElemwiseActivationKernel(const Context &dev_ctx,
+                                   const DenseTensor &x,
+                                   const DenseTensor &y,
+                                   const std::vector<std::string> &functor_list,
+                                   int axis,
+                                   float scale,
+                                   bool save_intermediate_out,
+                                   DenseTensor *out,
+                                   DenseTensor *intermediate_out) {
+  auto &lhs = GET_DATA_SAFELY(&x, "Input", "X", "FusedElemwiseActivation");
+  auto &rhs = GET_DATA_SAFELY(&y, "Input", "Y", "FusedElemwiseActivation");
+
+  // Out is mandatory; IntermediateOut only when the caller asked for it.
+  PADDLE_ENFORCE_EQ(
+      out != nullptr,
+      true,
+      phi::errors::InvalidArgument("The output(Out) should not be empty"));
+  if (save_intermediate_out) {
+    PADDLE_ENFORCE_EQ(intermediate_out != nullptr,
+                      true,
+                      phi::errors::InvalidArgument(
+                          "The save_intermediate_out is enable, so the "
+                          "IntermediateOut should not be empty."));
+  }
+
+  // Keep the two-slot layout: {Out, IntermediateOut-or-null}.
+  std::vector<phi::DenseTensor *> results = {
+      out, save_intermediate_out ? intermediate_out : nullptr};
+
+  phi::funcs::RunFunctors<Context, T>(dev_ctx,
+                                      lhs,
+                                      rhs,
+                                      &results,
+                                      functor_list,
+                                      scale,
+                                      axis,
+                                      save_intermediate_out);
+}
+
+// Backward kernel of the fused elementwise + activation op.
+//
+// Steps:
+//   1. Append "_grad" to the first two functor names unless already present.
+//   2. Pick the saved intermediate result when save_intermediate_out is set.
+//   3. Let out_grad stand in for X / Out when they are uninitialized, which
+//      is accepted only if phi::funcs::InputXCanBeAbsent approves the list.
+//   4. Dispatch to phi::funcs::RunGradFunctors, selecting the InPlace
+//      instantiation according to phi::funcs::HasInPlaceUnary.
+//
+// Args:
+//   dev_ctx:           device context handed to RunGradFunctors.
+//   x, out:            may be uninitialized (see step 3).
+//   intermediate_out:  read only when save_intermediate_out is true.
+//   functor_list:      must hold at least two entries.
+//   axis, scale:       forwarded unchanged to RunGradFunctors.
+//   x_grad, y_grad:    outputs, forwarded unchanged to RunGradFunctors.
+//
+// The gradient of the intermediate output is always passed as nullptr.
+template <typename T, typename Context>
+void FusedElemwiseActivationGradKernel(
+    const Context &dev_ctx,
+    const DenseTensor &x,
+    const DenseTensor &y,
+    const DenseTensor &out,
+    const DenseTensor &intermediate_out,
+    const DenseTensor &out_grad,
+    const std::vector<std::string> &functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    DenseTensor *x_grad,
+    DenseTensor *y_grad) {
+  // Entries 0 and 1 are indexed below, so reject shorter lists up front
+  // instead of reading past the end of the vector.
+  PADDLE_ENFORCE_GE(
+      functor_list.size(),
+      static_cast<size_t>(2),
+      phi::errors::InvalidArgument(
+          "The functor_list should contain at least two functors, but "
+          "received %d.",
+          functor_list.size()));
+
+  // Make sure each of the two functor names ends with exactly one "_grad".
+  const std::string grad_suffix = "_grad";
+  std::vector<std::string> functor_list_new = functor_list;
+  for (size_t i = 0; i < 2; ++i) {
+    std::string &name = functor_list_new[i];
+    const bool has_suffix =
+        name.size() >= grad_suffix.size() &&
+        name.compare(name.size() - grad_suffix.size(),
+                     grad_suffix.size(),
+                     grad_suffix) == 0;
+    if (!has_suffix) {
+      name += grad_suffix;
+    }
+  }
+
+  // The inputs arrive as references, so their addresses can never be null
+  // and need no runtime check.
+  const phi::DenseTensor *in_y = &y;
+  const phi::DenseTensor *in_out_grad = &out_grad;
+
+  // intermediate_out_grad is not supported in ops.yaml, so use nullptr.
+  phi::DenseTensor *d_intermediate_out = nullptr;
+
+  // If save_intermediate_out is true, for Unary(Binary(x, y)) and
+  // Binary(x, Unary(y)), the Binary(x, y) and Unary(y) need not be
+  // recomputed.
+  phi::DenseTensor *in_intermediate_out =
+      save_intermediate_out
+          ? const_cast<phi::DenseTensor *>(&intermediate_out)
+          : nullptr;
+
+  // If functor_list contains elementwise_add, the backward doesn't use
+  // in_x, in_y and in_out, so out_grad may stand in for a missing X.
+  phi::DenseTensor *in_x = const_cast<phi::DenseTensor *>(&x);
+  if (!x.initialized()) {
+    PADDLE_ENFORCE_EQ(phi::funcs::InputXCanBeAbsent(functor_list_new),
+                      true,
+                      phi::errors::InvalidArgument(
+                          "Only when the compoundfunctor contains "
+                          "elementwise_add_grad, the 'X' could be absent."));
+    in_x = const_cast<phi::DenseTensor *>(in_out_grad);
+  }
+
+  // The same substitution applies to a missing Out.
+  phi::DenseTensor *in_out = const_cast<phi::DenseTensor *>(&out);
+  if (!out.initialized()) {
+    PADDLE_ENFORCE_EQ(phi::funcs::InputXCanBeAbsent(functor_list_new),
+                      true,
+                      phi::errors::InvalidArgument(
+                          "Only when the compoundfunctor contains "
+                          "elementwise_add_grad, the 'Out' could be absent."));
+    in_out = const_cast<phi::DenseTensor *>(in_out_grad);
+  }
+
+  // HasInPlaceUnary decides at run time which compile-time InPlace variant
+  // of RunGradFunctors is invoked; both receive identical arguments.
+  bool has_in_place = phi::funcs::HasInPlaceUnary(functor_list_new);
+  if (has_in_place) {
+    phi::funcs::RunGradFunctors<Context, T, true /*InPlace*/>(
+        dev_ctx,
+        in_x,
+        in_y,
+        in_out,
+        in_intermediate_out,
+        in_out_grad,
+        x_grad,
+        y_grad,
+        d_intermediate_out,
+        functor_list_new,
+        scale,
+        axis);
+  } else {
+    phi::funcs::RunGradFunctors<Context, T, false /*InPlace*/>(
+        dev_ctx,
+        in_x,
+        in_y,
+        in_out,
+        in_intermediate_out,
+        in_out_grad,
+        x_grad,
+        y_grad,
+        d_intermediate_out,
+        functor_list_new,
+        scale,
+        axis);
+  }
+}
+// Add-flavoured entry point: forwards to FusedElemwiseActivationKernel.
+template <typename T, typename Context>
+void FusedElemwiseAddActivationKernel(
+    const Context &dev_ctx,
+    const DenseTensor &x,
+    const DenseTensor &y,
+    const std::vector<std::string> &functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    DenseTensor *out,
+    DenseTensor *intermediate_out) {
+  FusedElemwiseActivationKernel<T, Context>(dev_ctx,
+                                            x,
+                                            y,
+                                            functor_list,
+                                            axis,
+                                            scale,
+                                            save_intermediate_out,
+                                            out,
+                                            intermediate_out);
+}
+
+// Grad kernel of the add variant. x and intermediate_out are optional here;
+// each is unwrapped into a plain tensor (a default-constructed one when the
+// optional is empty) before delegating to FusedElemwiseActivationGradKernel.
+template <typename T, typename Context>
+void FusedElemwiseAddActivationGradKernel(
+    const Context &dev_ctx,
+    const paddle::optional<DenseTensor> &x,
+    const DenseTensor &y,
+    const DenseTensor &out,
+    const paddle::optional<DenseTensor> &intermediate_out,
+    const DenseTensor &out_grad,
+    const std::vector<std::string> &functor_list,
+    int axis,
+    float scale,
+    bool save_intermediate_out,
+    DenseTensor *x_grad,
+    DenseTensor *y_grad) {
+  phi::DenseTensor absent;
+  const phi::DenseTensor x_or_absent = x ? x.get() : absent;
+  const phi::DenseTensor inter_or_absent =
+      intermediate_out ? intermediate_out.get() : absent;
+
+  FusedElemwiseActivationGradKernel<T, Context>(dev_ctx,
+                                                x_or_absent,
+                                                y,
+                                                out,
+                                                inter_or_absent,
+                                                out_grad,
+                                                functor_list,
+                                                axis,
+                                                scale,
+                                                save_intermediate_out,
+                                                x_grad,
+                                                y_grad);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h b/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h
index 26e2898bf73ff..aa23bddb5b979 100644
--- a/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h
@@ -18,6 +18,7 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/complex_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/matrix_inverse.h"
 
@@ -37,15 +38,35 @@ void InverseGradKernel(const Context& dev_ctx,
     tmp_out.Resize(out.dims());
     dev_ctx.template Alloc<T>(&tmp_out);
 
-    auto mat_dim_a0 =
-        phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false);
-    auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
-    blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0));
+    if (IsComplexType(out.dtype())) {
+      DenseTensor out_conj;
+      out_conj.Resize(out.dims());
+      dev_ctx.template Alloc<T>(&out_conj);
 
-    auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
-    auto mat_dim_b1 =
-        phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
-    blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0));
+      phi::ConjKernel<T, Context>(dev_ctx, out, &out_conj);
+
+      auto mat_dim_a0 =
+          phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false);
+      auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
+      blas.MatMul(
+          out_grad, mat_dim_a0, out_conj, mat_dim_b0, T(1), &tmp_out, T(0));
+
+      auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
+      auto mat_dim_b1 =
+          phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
+      blas.MatMul(
+          out_conj, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0));
+    } else {
+      auto mat_dim_a0 =
+          phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false);
+      auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
+      blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0));
+
+      auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
+      auto mat_dim_b1 =
+          phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
+      blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0));
+    }
   }
 }
 
diff --git a/paddle/phi/kernels/impl/lstm_kernel_impl.h b/paddle/phi/kernels/impl/lstm_kernel_impl.h
new file mode 100644
index 0000000000000..1f4b4dcac0f14
--- /dev/null
+++ b/paddle/phi/kernels/impl/lstm_kernel_impl.h
@@ -0,0 +1,443 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
+#include "paddle/phi/kernels/funcs/lstm_compute.h"
+#include "paddle/phi/kernels/funcs/lstm_utils.h"
+
+namespace phi {
+// Forward LSTM: batch-reorders input, steps LstmUnitFunctor, restores order.
+template <typename T, typename Context>
+void LSTMKernel(const Context& dev_ctx,
+                const DenseTensor& input,
+                const paddle::optional<DenseTensor>& h0,
+                const paddle::optional<DenseTensor>& c0,
+                const DenseTensor& weight,
+                const DenseTensor& bias,
+                bool use_peepholes,
+                bool is_reverse,
+                bool is_test,
+                const std::string& gate_activation,
+                const std::string& cell_activation,
+                const std::string& candidate_activation,
+                DenseTensor* hidden,
+                DenseTensor* cell,
+                DenseTensor* batch_gate,
+                DenseTensor* batch_cell_pre_act) {
+  auto* hidden_t0 = h0.get_ptr();
+  auto* cell_t0 = c0.get_ptr();
+
+  phi::DenseTensor* batch_gate_new = nullptr;
+  phi::DenseTensor batch_gate_temp;
+  if (is_test) {
+    batch_gate_new = &batch_gate_temp;
+    batch_gate_new->Resize(input.dims());
+  } else {
+    batch_gate_new = batch_gate;
+  }
+
+  dev_ctx.template Alloc<T>(batch_gate_new);
+  dev_ctx.template Alloc<T>(hidden);
+  dev_ctx.template Alloc<T>(cell);
+
+  phi::funcs::LoDTensor2BatchFunctor<Context, T> to_batch;
+  to_batch(dev_ctx, input, batch_gate_new, true, is_reverse);
+
+  auto in_dims = input.dims();
+  int frame_size = static_cast<int>(in_dims[1] / 4);
+  phi::DDim dims({in_dims[0], frame_size});
+
+  if (bias.initialized()) {
+    phi::DenseTensor b = bias;
+    b.Resize({bias.numel(), 1});
+    phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size);
+    phi::funcs::RowwiseAdd<Context, T> add_bias;
+    add_bias(dev_ctx, *batch_gate_new, gate_bias, batch_gate_new);
+  }
+
+  phi::funcs::LstmMetaValue<T> lstm_value;
+  if (bias.initialized() && use_peepholes) {
+    T* bias_data = const_cast<T*>(bias.data<T>());
+    // Peephole pointers: check_ig starts after the first 4 * frame_size bias
+    // entries; check_fg and check_og follow at frame_size strides.
+    lstm_value.check_ig = bias_data + 4 * frame_size;
+    lstm_value.check_fg = lstm_value.check_ig + frame_size;
+    lstm_value.check_og = lstm_value.check_fg + frame_size;
+  } else {
+    lstm_value.check_ig = nullptr;
+    lstm_value.check_fg = nullptr;
+    lstm_value.check_og = nullptr;
+  }
+  lstm_value.prev_state_value = nullptr;
+  phi::DenseTensor ordered_c0;
+
+  phi::Vector<size_t> order(batch_gate_new->lod()[2]);
+
+  if (cell_t0) {
+    // Batch computation for LSTM reorders the input sequences according to
+    // their length, so the initial cell state has to be reordered in the
+    // same way.
+    ReorderInitState<Context, T>(dev_ctx, *cell_t0, order, &ordered_c0, true);
+    lstm_value.prev_state_value = ordered_c0.data<T>();
+  }
+
+  // In test mode a local tensor replaces the batch_cell_pre_act output.
+  phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp;
+  phi::DenseTensor* batch_cell_pre_act_p;
+  if (is_test) {
+    batch_cell_pre_act_p = &batch_cell_pre_act_temp;
+  } else {
+    batch_cell_pre_act_p = batch_cell_pre_act;
+  }
+  batch_hidden.Resize(dims);
+  batch_cell.Resize(dims);
+  dev_ctx.template Alloc<T>(&batch_hidden);
+  dev_ctx.template Alloc<T>(&batch_cell);
+  batch_cell_pre_act_p->Resize(dims);
+  dev_ctx.template Alloc<T>(batch_cell_pre_act_p);
+
+  auto batch_starts = batch_gate_new->lod()[0];
+  size_t num_batch = batch_starts.size() - 1;
+  auto gate_act = phi::funcs::detail::GetActivationType(gate_activation);
+  auto cell_act = phi::funcs::detail::GetActivationType(cell_activation);
+  auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation);
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  for (size_t n = 0; n < num_batch; n++) {
+    int bstart = static_cast<int>(batch_starts[n]);
+    int bend = static_cast<int>(batch_starts[n + 1]);
+
+    phi::DenseTensor gate_t = batch_gate_new->Slice(bstart, bend);
+    phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend);
+    phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend);
+    phi::DenseTensor cell_pre_act_t = batch_cell_pre_act_p->Slice(bstart, bend);
+
+    int cur_batch_size = bend - bstart;
+
+    if (n > 0) {
+      int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+      int pre_h_end = pre_h_start + cur_batch_size;
+      auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
+      blas.MatMul(pre_hidden_t,
+                  false,
+                  weight,
+                  false,
+                  static_cast<T>(1.0),
+                  &gate_t,
+                  static_cast<T>(1.0));
+    } else if (hidden_t0 != nullptr) {
+      // If n == 0 and there is no initialized hidden state, that is to say
+      // the H0 is zeros, the calculation W_h * H0 will be skipped.
+      // If n == 0 and there is initialized hidden state, calculate W_h * H0.
+
+      // Batch computation for LSTM reorders the input sequences according
+      // to their length, so the initial hidden state has to be reordered in
+      // the same way.
+      phi::DenseTensor ordered_h0;
+      ReorderInitState<Context, T>(
+          dev_ctx, *hidden_t0, order, &ordered_h0, true);
+      blas.MatMul(ordered_h0,
+                  false,
+                  weight,
+                  false,
+                  static_cast<T>(1.0),
+                  &gate_t,
+                  static_cast<T>(1.0));
+    }
+
+    lstm_value.gate_value = gate_t.data<T>();
+    lstm_value.output_value = out_t.data<T>();
+    lstm_value.state_value = cell_t.data<T>();
+    lstm_value.state_active_value = cell_pre_act_t.data<T>();
+    T cell_clip = 0.0;
+    phi::funcs::LstmUnitFunctor<Context, T>::compute(dev_ctx,
+                                                     lstm_value,
+                                                     frame_size,
+                                                     cur_batch_size,
+                                                     cell_clip,
+                                                     gate_act,
+                                                     cell_act,
+                                                     cand_act);
+    lstm_value.prev_state_value = lstm_value.state_value;
+  }
+
+  phi::funcs::Batch2LoDTensorFunctor<Context, T> to_seq;
+  batch_hidden.set_lod(batch_gate_new->lod());
+  // restore the output hidden in phi::DenseTensor from the batch hidden
+  to_seq(dev_ctx, batch_hidden, hidden);
+
+  batch_cell.set_lod(batch_gate_new->lod());
+  // restore the output cell state in phi::DenseTensor from the batch cell
+  to_seq(dev_ctx, batch_cell, cell);
+}
+// LSTM backward pass: runs LstmUnitGradFunctor over the batches in reverse.
+template <typename T, typename Context>
+void LSTMGradKernel(const Context& dev_ctx,
+                    const DenseTensor& input_in,
+                    const paddle::optional<DenseTensor>& h0_in,
+                    const paddle::optional<DenseTensor>& c0_in,
+                    const DenseTensor& weight_in,
+                    const DenseTensor& bias_in,
+                    const DenseTensor& hidden_in,
+                    const DenseTensor& cell_in,
+                    const DenseTensor& batch_gate_in,
+                    const DenseTensor& batch_cell_pre_act_in,
+                    const DenseTensor& hidden_grad,
+                    bool use_peepholes,
+                    bool is_reverse,
+                    bool is_test,
+                    const std::string& gate_activation,
+                    const std::string& cell_activation,
+                    const std::string& candidate_activation,
+                    DenseTensor* input_grad,
+                    DenseTensor* h0_grad,
+                    DenseTensor* c0_grad,
+                    DenseTensor* weight_grad,
+                    DenseTensor* bias_grad) {
+  auto* input = &input_in;
+  auto* weight = &weight_in;
+  auto* bias = &bias_in;
+  // bias_in is a reference: its address is never null, so test initialized().
+  auto* hidden_out = &hidden_in;
+  auto* cell_out = &cell_in;
+
+  auto* batch_gate = &batch_gate_in;
+  auto* batch_cell_pre_act = &batch_cell_pre_act_in;
+
+  auto* hidden_g = &hidden_grad;
+
+  auto* in_g = input_grad;
+  auto* weight_g = weight_grad;
+  auto* bias_g = bias_grad;
+
+  auto* h0 = h0_in.get_ptr();
+  auto* c0 = c0_in.get_ptr();
+
+  auto* h0_g = h0_grad;
+  auto* c0_g = c0_grad;
+
+  phi::funcs::SetConstant<Context, T> zero;
+  if (weight_g) {
+    dev_ctx.template Alloc<T>(weight_g);
+    zero(dev_ctx, weight_g, static_cast<T>(0.0));
+  }
+
+  // ordered_h0/c0 is the reordered hidden/cell initialization.
+  // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+  // initialization.
+  phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+  phi::Vector<size_t> order(batch_gate->lod()[2]);
+
+  if (c0) {
+    ReorderInitState<Context, T>(dev_ctx, *c0, order, &ordered_c0, true);
+  }
+  if (c0 && c0_g) {
+    ordered_c0_g.Resize(c0_g->dims());
+    dev_ctx.template Alloc<T>(&ordered_c0_g);
+  }
+
+  auto in_dims = input->dims();
+  auto out_dims = hidden_g->dims();
+  int frame_size = static_cast<int>(in_dims[1] / 4);
+  PADDLE_ENFORCE_EQ(frame_size,
+                    out_dims[1],
+                    phi::errors::InvalidArgument(
+                        "The second dimension of Input(hidden_grad) should be "
+                        "%d, but received %d in LSTM@Grad operator.",
+                        frame_size,
+                        out_dims[1]));
+
+  phi::funcs::LstmMetaValue<T> lstm_value;
+  if (bias->initialized() && use_peepholes) {
+    T* bias_data = const_cast<T*>(bias->data<T>());
+    lstm_value.check_ig = bias_data + 4 * frame_size;
+    lstm_value.check_fg = lstm_value.check_ig + frame_size;
+    lstm_value.check_og = lstm_value.check_fg + frame_size;
+  } else {
+    lstm_value.check_ig = nullptr;
+    lstm_value.check_fg = nullptr;
+    lstm_value.check_og = nullptr;
+  }
+
+  phi::funcs::LstmMetaGrad<T> lstm_grad;
+
+  if (bias_g) {
+    dev_ctx.template Alloc<T>(bias_g);
+    zero(dev_ctx, bias_g, static_cast<T>(0.0));
+  }
+  if (bias_g && use_peepholes) {
+    T* bias_g_data = bias_g->data<T>();
+    lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+    lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
+    lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
+  } else {
+    lstm_grad.check_ig_grad = nullptr;
+    lstm_grad.check_fg_grad = nullptr;
+    lstm_grad.check_og_grad = nullptr;
+  }
+
+  phi::funcs::LoDTensor2BatchFunctor<Context, T> to_batch;
+
+  auto ToBatch = [&batch_gate, &to_batch](const Context& ctx,
+                                          const phi::DenseTensor& src,
+                                          const phi::DDim& dims,
+                                          phi::DenseTensor& dst) {
+    dst.Resize(dims);
+    ctx.template Alloc<T>(&dst);
+    dst.set_lod(batch_gate->lod());
+    to_batch(ctx, src, &dst, false);
+  };
+
+  phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell;
+  ToBatch(dev_ctx, *hidden_out, out_dims, batch_hidden);
+  ToBatch(dev_ctx, *hidden_g, out_dims, batch_hidden_g);
+  ToBatch(dev_ctx, *cell_out, out_dims, batch_cell);
+
+  phi::DenseTensor batch_cell_g, batch_gate_g;
+  batch_cell_g.Resize(out_dims);
+  dev_ctx.template Alloc<T>(&batch_cell_g);
+  // TODO(qingqing) support the case output cell has gradient.
+  // to_batch(dev_ctx, *cell_g, batch_cell_g, false);
+  zero(dev_ctx, &batch_cell_g, static_cast<T>(0.0));
+  batch_gate_g.Resize(batch_gate->dims());
+  dev_ctx.template Alloc<T>(&batch_gate_g);
+  batch_gate_g.set_lod(batch_gate->lod());
+
+  auto gate_act = phi::funcs::detail::GetActivationType(gate_activation);
+  auto cell_act = phi::funcs::detail::GetActivationType(cell_activation);
+  auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation);
+
+  auto batch_starts = batch_gate->lod()[0];
+  size_t num_batch = batch_starts.size() - 1;
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+    int bstart = static_cast<int>(batch_starts[n]);
+    int bend = static_cast<int>(batch_starts[n + 1]);
+
+    phi::DenseTensor gate = batch_gate->Slice(bstart, bend);
+    phi::DenseTensor cell = batch_cell.Slice(bstart, bend);
+    phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
+    lstm_value.gate_value = gate.data<T>();
+    lstm_value.state_value = cell.data<T>();
+    lstm_value.state_active_value = cell_pre_act.data<T>();
+
+    phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend);
+    phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend);
+    phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend);
+    lstm_grad.state_grad = cell_g.data<T>();
+    lstm_grad.gate_grad = gate_g.data<T>();
+    lstm_grad.output_grad = out_g.data<T>();
+
+    if (n > 0) {
+      int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+      phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
+      phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
+      lstm_value.prev_state_value = cell_pre.data<T>();
+      lstm_grad.prev_state_grad = cell_pre_g.data<T>();
+    } else {
+      lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
+      lstm_grad.prev_state_grad =
+          (c0 && c0_g) ? ordered_c0_g.data<T>() : nullptr;
+    }
+    // lstm_value.output_value not used in bp, set to nullptr
+    // lstm_grad.state_active_grad not used in bp, set to nullptr
+    lstm_value.output_value = nullptr;
+    lstm_grad.state_active_grad = nullptr;
+    int cur_batch_size = bend - bstart;
+    T cell_clip = 0.0;
+    phi::funcs::LstmUnitGradFunctor<Context, T>::compute(dev_ctx,
+                                                         lstm_value,
+                                                         lstm_grad,
+                                                         frame_size,
+                                                         cur_batch_size,
+                                                         cell_clip,
+                                                         gate_act,
+                                                         cell_act,
+                                                         cand_act);
+
+    if (n > 0) {
+      int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+      int pre_h_end = pre_h_start + cur_batch_size;
+      auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
+      blas.MatMul(gate_g,
+                  false,
+                  *weight,
+                  true,
+                  static_cast<T>(1.0),
+                  &pre_hidden_g,
+                  static_cast<T>(1.0));
+      if (weight_g) {
+        /* backward weight */
+        auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
+        blas.MatMul(pre_hidden,
+                    true,
+                    gate_g,
+                    false,
+                    static_cast<T>(1.0),
+                    weight_g,
+                    static_cast<T>(1.0));
+      }
+    } else {
+      if (h0 && weight_g) {
+        ReorderInitState<Context, T>(dev_ctx, *h0, order, &ordered_h0, true);
+        blas.MatMul(ordered_h0,
+                    true,
+                    gate_g,
+                    false,
+                    static_cast<T>(1.0),
+                    weight_g,
+                    static_cast<T>(1.0));
+      }
+      if (h0 && h0_g) {
+        ordered_h0_g.Resize(h0_g->dims());
+        dev_ctx.template Alloc<T>(&ordered_h0_g);
+        blas.MatMul(gate_g,
+                    false,
+                    *weight,
+                    true,
+                    static_cast<T>(1.0),
+                    &ordered_h0_g,
+                    static_cast<T>(0.0));
+      }
+    }
+  }
+
+  phi::funcs::Batch2LoDTensorFunctor<Context, T> to_seq;
+  if (in_g) {
+    /* backward data */
+    dev_ctx.template Alloc<T>(in_g);
+    to_seq(dev_ctx, batch_gate_g, in_g);
+  }
+  if (bias_g) {
+    /* backward bias */
+    phi::DenseTensor b_g = *bias_g;
+    b_g.Resize({bias_g->numel(), 1});
+    phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
+    phi::funcs::ColwiseSum<Context, T> col_sum;
+    col_sum(dev_ctx, batch_gate_g, &gate_bias_g);
+  }
+
+  if (h0 && h0_g) {
+    ReorderInitState<Context, T>(dev_ctx, ordered_h0_g, order, h0_g, false);
+  }
+  if (c0 && c0_g) {
+    ReorderInitState<Context, T>(dev_ctx, ordered_c0_g, order, c0_g, false);
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
index c8a42e0265fb8..7ed3d65b6410e 100644
--- a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
@@ -36,6 +36,7 @@ void PoolGradRawKernel(const Context& ctx,
                        bool global_pooling,
                        bool adaptive,
                        const std::string& padding_algorithm,
+                       const float norm_type,
                        DenseTensor* dx) {
   const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
   std::vector<int> paddings_ = paddings;
@@ -71,9 +72,15 @@ void PoolGradRawKernel(const Context& ctx,
     funcs::SetConstant<Context, T> set_constant;
     set_constant(ctx, dx, static_cast<T>(0.0));
 
+    std::string true_type;
+    if (norm_type == INFINITY)
+      true_type = "max";
+    else
+      true_type = pooling_type;
+
     switch (kernel_size_.size()) {
       case 2: {
-        if (pooling_type == "max") {
+        if (true_type == "max") {
           funcs::MaxPool2dGradFunctor<Context, T> pool2d_backward;
           pool2d_backward(ctx,
                           x,
@@ -84,7 +91,7 @@ void PoolGradRawKernel(const Context& ctx,
                           paddings_,
                           data_format,
                           dx);
-        } else if (pooling_type == "avg") {
+        } else if (true_type == "avg") {
           funcs::Pool2dGradFunctor<Context, funcs::AvgPoolGrad<T>, T>
               pool2d_backward;
           funcs::AvgPoolGrad<T> pool_process;
@@ -100,6 +107,23 @@ void PoolGradRawKernel(const Context& ctx,
                           adaptive,
                           dx,
                           pool_process);
+        } else {  // lp_pool2d
+          funcs::Pool2dGradFunctor<Context, funcs::LPPoolGrad<T>, T>
+              pool2d_backward;
+          funcs::LPPoolGrad<T> pool_process;
+          pool_process.setNormType(norm_type);
+          pool2d_backward(ctx,
+                          x,
+                          out,
+                          dout,
+                          kernel_size_,
+                          strides,
+                          paddings_,
+                          data_format,
+                          exclusive,
+                          adaptive,
+                          dx,
+                          pool_process);
         }
       } break;
       case 3: {
@@ -215,6 +239,43 @@ void Pool2dGradKernel(const Context& ctx,
                                 global_pooling,
                                 adaptive,
                                 padding_algorithm,
+                                0,
+                                dx);
+}
+
+template <typename T, typename Context>
+void LPPool2dGradKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& out,
+                        const DenseTensor& dout,
+                        const IntArray& kernel_size,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        bool ceil_mode UNUSED,  // never read in this body
+                        bool exclusive,
+                        const std::string& data_format,
+                        const std::string& pooling_type,
+                        bool global_pooling,
+                        bool adaptive,
+                        const std::string& padding_algorithm,
+                        const float norm_type,  // forwarded unchanged
+                        DenseTensor* dx) {  // wraps PoolGradRawKernel
+  std::vector<int> kernel_size_val(kernel_size.GetData().begin(),
+                                   kernel_size.GetData().end());  // as ints
+  PoolGradRawKernel<T, Context>(ctx,
+                                x,
+                                out,
+                                dout,
+                                kernel_size_val,
+                                strides,
+                                paddings,
+                                exclusive,
+                                data_format,
+                                pooling_type,
+                                global_pooling,
+                                adaptive,
+                                padding_algorithm,
+                                norm_type,
                                 dx);
 }
 
@@ -304,6 +365,7 @@ void Pool3dGradKernel(const Context& ctx,
                                 global_pooling,
                                 adaptive,
                                 padding_algorithm,
+                                0,
                                 dx);
 }
 
diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h
index 50a5195e771e8..2a370c7c876e5 100644
--- a/paddle/phi/kernels/impl/pool_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pool_kernel_impl.h
@@ -61,6 +61,7 @@ void PoolRawKernel(const Context& ctx,
                    bool global_pooling,
                    bool adaptive,
                    const std::string& padding_algorithm,
+                   const float norm_type,
                    DenseTensor* out) {
   const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
   std::vector<int> paddings_ = paddings;
@@ -75,6 +76,15 @@ void PoolRawKernel(const Context& ctx,
     data_dims = slice_ddim(x_dims, 2, x_dims.size());
   }
 
+  std::string true_type;
+  if (norm_type == INFINITY)
+    true_type = "max";
+  else
+    true_type = pooling_type;
+  if (true_type == "lp" && norm_type == 0)
+    PADDLE_THROW(
+        errors::InvalidArgument("norm_type of LPPool op cannot be 0."));
+
   funcs::UpdatePadding(&paddings_,
                        global_pooling,
                        adaptive,
@@ -95,7 +105,7 @@ void PoolRawKernel(const Context& ctx,
 
   switch (kernel_size_.size()) {
     case 2: {
-      if (pooling_type == "max") {
+      if (true_type == "max") {
         funcs::Pool2dFunctor<Context, funcs::MaxPool<T>, T> pool2d_forward;
         funcs::MaxPool<T> pool_process;
         pool2d_forward(ctx,
@@ -109,7 +119,7 @@ void PoolRawKernel(const Context& ctx,
                        out,
                        pool_process);
 
-      } else if (pooling_type == "avg") {
+      } else if (true_type == "avg") {
         std::vector<int> reduce_dim;
         int reduce_num = GetReduceNum(x, out, channel_last, &reduce_dim);
         if (reduce_num > 0 &&
@@ -146,10 +156,24 @@ void PoolRawKernel(const Context& ctx,
                          out,
                          pool_process);
         }
+      } else {  // lp_pool2d
+        funcs::Pool2dFunctor<Context, funcs::LPPool<T>, T> pool2d_forward;
+        funcs::LPPool<T> pool_process;
+        pool_process.setNormType(norm_type);
+        pool2d_forward(ctx,
+                       x,
+                       kernel_size_,
+                       strides,
+                       paddings_,
+                       data_format,
+                       exclusive,
+                       adaptive,
+                       out,
+                       pool_process);
       }
     } break;
     case 3: {
-      if (pooling_type == "max") {
+      if (true_type == "max") {
         funcs::Pool3dFunctor<Context, funcs::MaxPool<T>, T> pool3d_forward;
         funcs::MaxPool<T> pool_process;
         pool3d_forward(ctx,
@@ -162,7 +186,7 @@ void PoolRawKernel(const Context& ctx,
                        false,
                        out,
                        pool_process);
-      } else if (pooling_type == "avg") {
+      } else if (true_type == "avg") {
         funcs::Pool3dFunctor<Context, funcs::AvgPool<T>, T> pool3d_forward;
         funcs::AvgPool<T> pool_process;
         pool3d_forward(ctx,
@@ -175,6 +199,9 @@ void PoolRawKernel(const Context& ctx,
                        adaptive,
                        out,
                        pool_process);
+      } else {  // lp_pool3d
+        PADDLE_THROW(
+            errors::InvalidArgument("LPPool op only supports 2D input."));
       }
     } break;
     default: {
@@ -249,6 +276,39 @@ void Pool2dKernel(const Context& ctx,
                             global_pooling,
                             adaptive,
                             padding_algorithm,
+                            0,
+                            out);
+}
+
+template <typename T, typename Context>
+void LPPool2dKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const IntArray& kernel_size,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings,
+                    bool ceil_mode UNUSED,  // never read in this body
+                    bool exclusive,
+                    const std::string& data_format,
+                    const std::string& pooling_type,
+                    bool global_pooling,
+                    bool adaptive,
+                    const std::string& padding_algorithm,
+                    const float norm_type,  // forwarded unchanged
+                    DenseTensor* out) {  // wraps PoolRawKernel
+  std::vector<int> kernel_size_val(kernel_size.GetData().begin(),
+                                   kernel_size.GetData().end());  // as ints
+  PoolRawKernel<T, Context>(ctx,
+                            x,
+                            kernel_size_val,
+                            strides,
+                            paddings,
+                            exclusive,
+                            data_format,
+                            pooling_type,
+                            global_pooling,
+                            adaptive,
+                            padding_algorithm,
+                            norm_type,
                             out);
 }
 
@@ -298,6 +358,7 @@ void Pool3dKernel(const Context& ctx,
                             global_pooling,
                             adaptive,
                             padding_algorithm,
+                            0,
                             out);
 }
 
diff --git a/paddle/phi/kernels/lstm_kernel.h b/paddle/phi/kernels/lstm_kernel.h
new file mode 100644
index 0000000000000..42195e375c3a9
--- /dev/null
+++ b/paddle/phi/kernels/lstm_kernel.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LSTMKernel(const Context& dev_ctx,
+                const DenseTensor& input,
+                const paddle::optional<DenseTensor>& h0,  // may be absent
+                const paddle::optional<DenseTensor>& c0,  // may be absent
+                const DenseTensor& weight,
+                const DenseTensor& bias,
+                bool use_peepholes,
+                bool is_reverse,
+                bool is_test,
+                const std::string& gate_activation,
+                const std::string& cell_activation,
+                const std::string& candidate_activation,
+                DenseTensor* hidden,  // presumably an output; TODO confirm
+                DenseTensor* cell,    // presumably an output; TODO confirm
+                DenseTensor* batch_gate,
+                DenseTensor* batch_cell_pre_act);  // declaration only
+
+template <typename T, typename Context>
+void LSTMGradKernel(const Context& dev_ctx,
+                    const DenseTensor& input,
+                    const paddle::optional<DenseTensor>& h0,  // may be absent
+                    const paddle::optional<DenseTensor>& c0,  // may be absent
+                    const DenseTensor& weight,
+                    const DenseTensor& bias,
+                    const DenseTensor& hidden,
+                    const DenseTensor& cell,
+                    const DenseTensor& batch_gate,
+                    const DenseTensor& batch_cell_pre_act,
+                    const DenseTensor& hidden_grad,
+                    bool use_peepholes,
+                    bool is_reverse,
+                    bool is_test,
+                    const std::string& gate_activation,
+                    const std::string& cell_activation,
+                    const std::string& candidate_activation,
+                    DenseTensor* input_grad,  // presumably outputs from here
+                    DenseTensor* h0_grad,     // on; TODO confirm nullability
+                    DenseTensor* c0_grad,
+                    DenseTensor* weight_grad,
+                    DenseTensor* bias_grad);  // declaration only
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/onednn/concat_kernel.cc b/paddle/phi/kernels/onednn/concat_kernel.cc
index c7c258ea88001..725ed9f34cf98 100644
--- a/paddle/phi/kernels/onednn/concat_kernel.cc
+++ b/paddle/phi/kernels/onednn/concat_kernel.cc
@@ -106,7 +106,7 @@ static const std::vector<const DenseTensor*> ReduceMultiInput(
 template <typename T, typename Context>
 void ConcatKernel(const Context& dev_ctx,
                   const std::vector<const DenseTensor*>& x,
-                  const Scalar& axis,
+                  const Scalar& axis_,
                   DenseTensor* out) {
   const auto& onednn_engine = dev_ctx.GetEngine();
   // If any of the multiple inputs of concat has an input size of 0, the
@@ -114,6 +114,9 @@ void ConcatKernel(const Context& dev_ctx,
   auto multi_input = ReduceMultiInput(x);
   EnforceLayouts(multi_input);
 
+  int64_t axis = axis_.to<int64_t>();
+  axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size());
+
   auto out_dims_vec = common::vectorize(out->dims());
   if (std::any_of(out_dims_vec.begin(), out_dims_vec.end(), [](int64_t i) {
         return i < 0;
@@ -125,12 +128,12 @@ void ConcatKernel(const Context& dev_ctx,
     }
 
     DDim out_dims =
-        funcs::ComputeAndCheckShape(true, x_dims, axis.to<size_t>());
+        funcs::ComputeAndCheckShape(true, x_dims, static_cast<size_t>(axis));
     out->Resize(out_dims);
   }
 
   funcs::ConcatOneDNNHandler<T> handler(
-      dev_ctx.GetPlace(), axis.to<int>(), onednn_engine, multi_input, out);
+      dev_ctx.GetPlace(), axis, onednn_engine, multi_input, out);
 
   std::vector<std::shared_ptr<memory>> srcs;
   srcs.reserve(multi_input.size());
diff --git a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h
index 2f813aa9dc050..d027a97b42f68 100644
--- a/paddle/phi/kernels/pool_grad_kernel.h
+++ b/paddle/phi/kernels/pool_grad_kernel.h
@@ -39,6 +39,24 @@ void Pool2dGradKernel(const Context& ctx,
                       const std::string& padding_algorithm,
                       DenseTensor* dx);
 
+template <typename T, typename Context>
+void LPPool2dGradKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& out,
+                        const DenseTensor& dout,
+                        const IntArray& kernel_size,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        bool ceil_mode,  // marked UNUSED in the definition
+                        bool exclusive,
+                        const std::string& data_format,
+                        const std::string& pooling_type,
+                        bool global_pooling,
+                        bool adaptive,
+                        const std::string& padding_algorithm,
+                        const float norm_type,  // INFINITY -> max-pool path
+                        DenseTensor* dx);  // declaration only
+
 template <typename T, typename Context>
 void Pool2dGradGPUDNNKernel(const Context& ctx,
                             const DenseTensor& x,
diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h
index e958d62d8c225..28e65d837818f 100644
--- a/paddle/phi/kernels/pool_kernel.h
+++ b/paddle/phi/kernels/pool_kernel.h
@@ -37,6 +37,22 @@ void Pool2dKernel(const Context& ctx,
                   const std::string& padding_algorithm,
                   DenseTensor* out);
 
+template <typename T, typename Context>
+void LPPool2dKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    const IntArray& kernel_size,
+                    const std::vector<int>& strides,
+                    const std::vector<int>& paddings,
+                    bool ceil_mode,  // marked UNUSED in the definition
+                    bool exclusive,
+                    const std::string& data_format,
+                    const std::string& pooling_type,
+                    bool global_pooling,
+                    bool adaptive,
+                    const std::string& padding_algorithm,
+                    const float norm_type,  // INFINITY -> max; 0 bad for lp
+                    DenseTensor* out);  // declaration only
+
 template <typename T, typename Context>
 void Pool2dGPUDNNKernel(const Context& ctx,
                         const DenseTensor& x,
diff --git a/paddle/phi/kernels/reduce_kernel_impl.cc b/paddle/phi/kernels/reduce_kernel_impl.cc
index 000cb99034c26..9319248099903 100644
--- a/paddle/phi/kernels/reduce_kernel_impl.cc
+++ b/paddle/phi/kernels/reduce_kernel_impl.cc
@@ -20,10 +20,16 @@ namespace phi {
 // oneDNN's reduction kernel is optimized only for reducing throughout the
 // most outer dims, so in case of another type of reduction, it would be
 // better to fallback to native implementation
-inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx) {
+inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx,
+                                     const bool mean_op) {
   const DenseTensor& x = ctx->InputAt<phi::DenseTensor>(0);
-  const TensorRef& dims_tmp = ctx->AttrAt<TensorRef>(0);
-  IntArray dims_array = IntArray(*dims_tmp.Get());
+  IntArray dims_array;
+  if (mean_op) {
+    dims_array = ctx->AttrAt<IntArray>(0);
+  } else {
+    const TensorRef& dims_tmp = ctx->AttrAt<TensorRef>(0);
+    dims_array = IntArray(*dims_tmp.Get());
+  }
   int ndims = x.dims().size();
   const bool reduce_all = recompute_reduce_all(x, dims_array);
   auto dims = dims_array.GetData();
@@ -53,7 +59,15 @@ inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx) {
 
 bool ReduceCheckIfOneDNNSupport(const KernelContext* ctx) {
   if (ctx->InputAt<phi::DenseTensor>(0).dims().size() > 5 ||
-      !HasOptimizedOneDNNKernel(ctx)) {
+      !HasOptimizedOneDNNKernel(ctx, false)) {
+    return false;
+  }
+  return true;
+}
+
+bool ReduceMeanCheckIfOneDNNSupport(const KernelContext* ctx) {
+  if (ctx->InputAt<phi::DenseTensor>(0).dims().size() > 5 ||
+      !HasOptimizedOneDNNKernel(ctx, true)) {
     return false;
   }
   return true;
diff --git a/paddle/phi/kernels/reduce_kernel_impl.h b/paddle/phi/kernels/reduce_kernel_impl.h
index aef4f57ddbdcf..e117f6ab335dd 100644
--- a/paddle/phi/kernels/reduce_kernel_impl.h
+++ b/paddle/phi/kernels/reduce_kernel_impl.h
@@ -21,4 +21,6 @@ bool ReduceCheckIfOneDNNSupport(const KernelContext* ctx);
 
 bool ReduceGradCheckIfOneDNNSupport(const KernelContext* ctx);
 
+bool ReduceMeanCheckIfOneDNNSupport(const KernelContext* ctx);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc
index 16b3abf0e2931..a657e7ba8c01d 100644
--- a/paddle/phi/kernels/reduce_mean_kernel.cc
+++ b/paddle/phi/kernels/reduce_mean_kernel.cc
@@ -67,7 +67,7 @@ PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {}
 #if defined(PADDLE_WITH_DNNL)
 PD_REGISTER_KERNEL(
     mean, OneDNN, ONEDNN, phi::MeanKernel, float, phi::dtype::bfloat16) {
-  kernel->check_if_onednn_kernel_support_ = phi::ReduceCheckIfOneDNNSupport;
+  kernel->check_if_onednn_kernel_support_ = phi::ReduceMeanCheckIfOneDNNSupport;
 }
 #endif
 
diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
index 77ae06206f19d..8664f3b4aaf20 100644
--- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc
@@ -21,8 +21,7 @@
 #include "paddle/phi/core/mixed_vector.h"
 #include "paddle/phi/kernels/cpu/hsigmoid_loss_grad.h"
 
-namespace phi {
-namespace sr {
+namespace phi::sr {
 
 static std::vector<int64_t> PathToRows(const DenseTensor& path) {
   std::set<int64_t> rows;
@@ -80,8 +79,7 @@ void HSigmoidLossGradKernel(const Context& ctx,
                                      w_grad);
 }
 
-}  // namespace sr
-}  // namespace phi
+}  // namespace phi::sr
 
 PD_REGISTER_KERNEL(hsigmoid_loss_grad_sr,
                    CPU,
diff --git a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc
index a5d2e66787316..19b72361feda7 100644
--- a/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/merge_selected_rows_kernel.cc
@@ -20,8 +20,7 @@
 
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
 
-namespace phi {
-namespace sr {
+namespace phi::sr {
 
 template <typename T, typename Context>
 void MergeSelectedRowsKernel(const Context& dev_ctx,
@@ -31,8 +30,7 @@ void MergeSelectedRowsKernel(const Context& dev_ctx,
   merge_func(dev_ctx, x, out);
 }
 
-}  // namespace sr
-}  // namespace phi
+}  // namespace phi::sr
 
 PD_REGISTER_KERNEL(merge_selected_rows,
                    CPU,
diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc
index e4b53658f42ed..ee7c0d64670d4 100644
--- a/paddle/phi/kernels/selected_rows/shape_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc
@@ -21,8 +21,7 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/shape_kernel.h"
 
-namespace phi {
-namespace sr {
+namespace phi::sr {
 
 template <typename T, typename Context>
 void ShapeKernel(const Context& ctx,
@@ -31,8 +30,7 @@ void ShapeKernel(const Context& ctx,
   phi::ShapeKernel<T, Context>(ctx, input.value(), out);
 }
 
-}  // namespace sr
-}  // namespace phi
+}  // namespace phi::sr
 
 PD_REGISTER_KERNEL(shape_sr,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc
index 73af07da806e0..37c517246f89e 100644
--- a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void BatchNormCooGradKernel(const Context& dev_ctx,
@@ -76,8 +75,7 @@ void BatchNormCooGradKernel(const Context& dev_ctx,
                                        bias_grad);
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(batch_norm_coo_grad,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc b/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc
index 991ee7bcaa778..430fd2462b1a7 100644
--- a/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 /* DENSE + COO @ DENSE -> DENSE */
 template <typename T, typename Context>
@@ -45,8 +44,7 @@ void AddmmCsrDenseKernel(const Context& dev_ctx UNUSED,
       "Not support CPU kernel of 'sparse.addmm' now."));
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(addmm_coo_dense,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc
index d18bdc4b12e96..4d62c8f70b579 100644
--- a/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc
@@ -19,8 +19,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/sparse/cpu/conv.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 // rulebook:
 //[
@@ -215,8 +214,7 @@ void Conv3dCooGradKernel(const Context& dev_ctx,
                                           kernel_grad);
       }));
 }
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(conv3d_coo_grad,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc
index 88a01e1135b7b..c5cd5ac42c275 100644
--- a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc
@@ -30,8 +30,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename IntT, typename Context>
 void AllocCsrPtr(const Context& dev_ctx,
@@ -432,8 +431,7 @@ DEFINE_ELEMENTWISE_GRAD_KERNEL(Add)
 DEFINE_ELEMENTWISE_GRAD_KERNEL(Subtract)
 DEFINE_ELEMENTWISE_GRAD_KERNEL(Multiply)
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(add_csr_csr_grad,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc
index 416b715a9a6a2..74436cbc85b52 100644
--- a/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void FusedAttentionCsrGradKernel(const Context& dev_ctx,
@@ -34,5 +33,4 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
       "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now");
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc
index 11c9e2d5c2007..2847ebff7e092 100644
--- a/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void FusedAttentionCsrKernel(
@@ -35,5 +34,4 @@ void FusedAttentionCsrKernel(
       "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now");
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
diff --git a/paddle/phi/kernels/sparse/cpu/mask_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_grad_kernel.cc
new file mode 100644
index 0000000000000..3503c88b2ef8b
--- /dev/null
+++ b/paddle/phi/kernels/sparse/cpu/mask_grad_kernel.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/sparse/mask_grad_kernel.h"
+#include "paddle/phi/kernels/sparse/mask_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(mask_as_coo_grad,  // CPU registration, dtypes below
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaskAsCooGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {  // input #1: sparse COO
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
+PD_REGISTER_KERNEL(mask_as_csr_grad,  // CPU registration, dtypes below
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaskAsCsrGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {  // input #1: sparse CSR
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR);
+}
diff --git a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc
index 5213dd44a4c07..7b8d24a440e74 100644
--- a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/sparse/mask_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 
 #include "paddle/common/ddim.h"
 #include "paddle/phi/api/ext/dispatch.h"
@@ -24,8 +25,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/sparse/flatten_indices.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename IntT>
 void MaskCooCPUKernel(const CPUContext& dev_ctx,
@@ -75,16 +75,116 @@ void MaskCooCPUKernel(const CPUContext& dev_ctx,
  * x and mask must have the same shape.
  **/
 template <typename T, typename Context>
-void MaskCooKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const SparseCooTensor& mask,
-                   SparseCooTensor* out) {
+void MaskAsCooKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const SparseCooTensor& mask,
+                     SparseCooTensor* out) {
   PD_VISIT_BASE_INTEGRAL_TYPES(
       mask.indices().dtype(), "MaskCooCPUKernel", ([&] {
         MaskCooCPUKernel<T, data_t>(dev_ctx, x, mask, out);
       }));
 }
 
+template <typename T, typename IntT>
+void MaskCsr2DCPUKernel(const CPUContext& dev_ctx,
+                        const DenseTensor& x,
+                        const SparseCsrTensor& mask,
+                        SparseCsrTensor* out) {
+  // The output reuses the mask's crows/cols (copied below); only the values
+  // are gathered from the dense input x.
+  const DenseTensor& mask_crows = mask.crows();
+  const DenseTensor& mask_cols = mask.cols();
+  int64_t nnz = mask.nnz();
+  DenseTensor out_crows = phi::EmptyLike<IntT>(dev_ctx, mask_crows);
+  DenseTensor out_cols = phi::EmptyLike<IntT>(dev_ctx, mask_cols);
+  DenseTensor out_values = phi::Empty<T>(dev_ctx, {nnz});
+  phi::Copy(dev_ctx, mask_crows, dev_ctx.GetPlace(), false, &out_crows);
+  phi::Copy(dev_ctx, mask_cols, dev_ctx.GetPlace(), false, &out_cols);
+
+  // `pos` is the running index into cols / output values; for row r the
+  // inner loop runs crows[r + 1] - crows[r] times.
+  int64_t pos = 0;
+  for (int64_t r = 0; r < mask_crows.numel() - 1; ++r) {
+    int64_t k = mask_crows.data<IntT>()[r];
+    while (k < mask_crows.data<IntT>()[r + 1]) {
+      const IntT col = mask_cols.data<IntT>()[pos];
+      const int64_t row_offset = (r / x.dims()[0]) * x.dims()[1] +
+                                 (r % x.dims()[0]) * x.dims()[1];
+      out_values.data<T>()[pos] = x.data<T>()[row_offset + col];
+      ++pos;
+      ++k;
+    }
+  }
+
+  out->SetMember(out_crows, out_cols, out_values, x.dims());
+}
+
+template <typename T, typename IntT>
+void MaskCsr3DCPUKernel(const CPUContext& dev_ctx,
+                        const DenseTensor& x,
+                        const SparseCsrTensor& mask,
+                        SparseCsrTensor* out) {
+  // The output reuses the mask's crows/cols (copied below); only the values
+  // are gathered from the dense 3-D input x.
+  const DenseTensor& mask_crows = mask.crows();
+  const DenseTensor& mask_cols = mask.cols();
+  int64_t nnz = mask.nnz();
+  DenseTensor out_crows = phi::EmptyLike<IntT>(dev_ctx, mask_crows);
+  DenseTensor out_cols = phi::EmptyLike<IntT>(dev_ctx, mask_cols);
+  DenseTensor out_values = phi::Empty<T>(dev_ctx, {nnz});
+  phi::Copy(dev_ctx, mask_crows, dev_ctx.GetPlace(), false, &out_crows);
+  phi::Copy(dev_ctx, mask_cols, dev_ctx.GetPlace(), false, &out_cols);
+
+  // `pos` is the running index into cols / output values.
+  int64_t pos = 0;
+  for (int64_t r = 0; r < mask_crows.numel() - 1; ++r) {
+    int64_t k = mask_crows.data<IntT>()[r];
+    while (k < mask_crows.data<IntT>()[r + 1]) {
+      const IntT col = mask_cols.data<IntT>()[pos];
+      // crows is split into x.dims()[0] segments of `seg_len` entries each.
+      const int64_t seg_len = mask_crows.numel() / x.dims()[0];
+      const int64_t dim0_offset = (r / seg_len) * (x.dims()[1] * x.dims()[2]);
+      const int64_t dim1_offset = (r % seg_len) * x.dims()[2];
+      out_values.data<T>()[pos] =
+          x.data<T>()[dim0_offset + dim1_offset + col];
+      ++pos;
+      ++k;
+    }
+  }
+
+  out->SetMember(out_crows, out_cols, out_values, x.dims());
+}
+
+/**
+ * @brief Gather the entries of the dense x selected by mask.crows() and
+ * mask.cols() into the SparseCsrTensor out. x and mask must have the same
+ * shape; x must be 2-D or 3-D, otherwise InvalidArgument is thrown.
+ **/
+template <typename T, typename Context>
+void MaskAsCsrKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const SparseCsrTensor& mask,
+                     SparseCsrTensor* out) {
+  const phi::DDim& x_dims = x.dims();
+  if (x_dims.size() == 2) {
+    PD_VISIT_BASE_INTEGRAL_TYPES(
+        mask.crows().dtype(), "MaskCsr2DCPUKernel", ([&] {
+          MaskCsr2DCPUKernel<T, data_t>(dev_ctx, x, mask, out);
+        }));
+  } else if (x_dims.size() == 3) {
+    PD_VISIT_BASE_INTEGRAL_TYPES(
+        mask.crows().dtype(), "MaskCsr3DCPUKernel", ([&] {
+          MaskCsr3DCPUKernel<T, data_t>(dev_ctx, x, mask, out);
+        }));
+  } else {
+    // Unsupported rank: raise instead of silently leaving `out` untouched.
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "mask_as for Sparse CSR Tensor only support 2-D or 3-D, but got "
+        "%d-D.",
+        x_dims.size()));
+  }
+}
+
 template <typename T, typename IntT>
 void MaskHelperCooCPUKernel(const CPUContext& dev_ctx,
                             const SparseCooTensor& x,
@@ -154,13 +254,28 @@ void MaskHelperCooKernel(const Context& dev_ctx,
       }));
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
+
+PD_REGISTER_KERNEL(mask_helper_coo,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaskHelperCooKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {  // tags input 0 as COO
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
 
-PD_REGISTER_KERNEL(mask_coo,
+PD_REGISTER_KERNEL(mask_as_coo,
                    CPU,
                    ALL_LAYOUT,
-                   phi::sparse::MaskCooKernel,
+                   phi::sparse::MaskAsCooKernel,
                    float,
                    double,
                    uint8_t,
@@ -174,18 +289,19 @@ PD_REGISTER_KERNEL(mask_coo,
   kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
 
-PD_REGISTER_KERNEL(mask_helper_coo,
+PD_REGISTER_KERNEL(mask_as_csr,
                    CPU,
                    ALL_LAYOUT,
-                   phi::sparse::MaskHelperCooKernel,
+                   phi::sparse::MaskAsCsrKernel,
                    float,
                    double,
-                   phi::dtype::float16,
                    uint8_t,
+                   int8_t,
                    int16_t,
                    int,
                    int64_t,
+                   bool,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {
-  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR);
 }
diff --git a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc
index 6d22d2a336e7e..cdd7efdb20924 100644
--- a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 // TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE"
 template <typename T, typename Context>
@@ -44,8 +43,7 @@ void MaskedMatmulCsrGradKernel(const Context& dev_ctx UNUSED,
       "Not support CPU backward kernel of 'sparse.masked_matmul' now."));
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(matmul_csr_dense_grad,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc
index fd70dc911cfde..5e6aa016d6c3e 100644
--- a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 // TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE"
 template <typename T, typename Context>
@@ -41,8 +40,7 @@ void MaskedMatmulCsrKernel(const Context& dev_ctx UNUSED,
       "Not support CPU kernel of 'sparse.masked_matmul' now."));
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(matmul_csr_dense,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/mv_kernel.cc b/paddle/phi/kernels/sparse/cpu/mv_kernel.cc
index 22abdb3ad12a3..68f7efd05d70d 100644
--- a/paddle/phi/kernels/sparse/cpu/mv_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/mv_kernel.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void MvCsrKernel(const Context& dev_ctx UNUSED,
@@ -38,8 +37,7 @@ void MvCooKernel(const Context& dev_ctx UNUSED,
       phi::errors::Unimplemented("Not support CPU kernel of 'sparse.mv' now."));
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(
     mv_csr, CPU, ALL_LAYOUT, phi::sparse::MvCsrKernel, float, double) {
diff --git a/paddle/phi/kernels/sparse/cpu/slice_kernel.cc b/paddle/phi/kernels/sparse/cpu/slice_kernel.cc
index 81af8339f88a9..20614fa10b04b 100644
--- a/paddle/phi/kernels/sparse/cpu/slice_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/slice_kernel.cc
@@ -20,8 +20,7 @@
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/slice_utils.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void SliceCooCompute(const Context& dev_ctx,
@@ -303,8 +302,7 @@ void SliceCsrKernel(const Context& dev_ctx,
       x_dims, &axes_vec, &starts_vec, &ends_vec);
   SliceCsrCompute<T, Context>(dev_ctx, x, axes_vec, starts_vec, ends_vec, out);
 }
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(slice_coo,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
index 0c5e6857de24c..4eea70631bd60 100644
--- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc
@@ -262,9 +262,9 @@ void CooToDenseCPUKernel(const CPUContext& dev_ctx,
                          const SparseCooTensor& x,
                          DenseTensor* out) {
   const auto non_zero_num = x.nnz();
-  const auto dense_dims = x.dims();
-  const auto indices = x.indices();
-  const auto values = x.values();
+  const auto& dense_dims = x.dims();
+  const auto& indices = x.indices();
+  const auto& values = x.values();
   const auto indices_dims = common::vectorize<int>(indices.dims());
   int64_t sparse_dim = indices_dims[0];
   if (indices_dims.size() == 1) {
diff --git a/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc
index 58a9720e1732b..70b737c2ec0a2 100644
--- a/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc
@@ -20,8 +20,7 @@
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
 #include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 std::vector<int> get_cpu_grad_perm(std::vector<int> perm) {
   std::vector<int> grad_perm(perm.size());
@@ -48,8 +47,7 @@ void TransposeCsrGradKernel(const Context& dev_ctx,
   std::vector<int> grad_perm = get_cpu_grad_perm(perm);
   TransposeCsrKernel<T, Context>(dev_ctx, dout, grad_perm, dx);
 }
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(transpose_coo_grad,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc b/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc
index bee2fe61ded54..6ae389ad90f46 100644
--- a/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc
@@ -21,8 +21,7 @@
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
 #include "paddle/phi/kernels/sparse/empty_kernel.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void TransposeCooKernel(const Context& dev_ctx,
@@ -201,8 +200,7 @@ void TransposeCsrKernel(const Context& dev_ctx,
     }
   }
 }
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(transpose_coo,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc
index 2fb11e7a66f2e..07087445b1eb6 100644
--- a/paddle/phi/kernels/sparse/empty_kernel.cc
+++ b/paddle/phi/kernels/sparse/empty_kernel.cc
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void EmptyLikeCooKernel(const Context& dev_ctx,
@@ -47,8 +46,7 @@ void EmptyLikeCsrKernel(const Context& dev_ctx,
   out->set_meta(x.meta());
   dev_ctx.template Alloc<T>(out_values);
 }
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(empty_like_coo,
                    CPU,
diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh b/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh
index 775c23def14b0..3b6de498ef5b5 100644
--- a/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh
+++ b/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh
@@ -566,7 +566,7 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f16f16f3
 
 // conv_forward_cuda_m128n16k16_f32f32f32
 template <int K_ld_factor, int N_ld_factor, bool K_ld_check, bool N_ld_check>
-__global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) 
+__global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C)
 {
 
   const int K_tile = 16;
@@ -578,27 +578,27 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32
   __shared__ float B_shared[256];
 
   #pragma unroll
-  for (int i = 0; i < 32; ++i)   
+  for (int i = 0; i < 32; ++i)
   {
     C_local[i] = 0.0;
   }
-  
+
   int K_loops = K_implicit / 16;
-  int block_num_n = (N - 1) / 16 + 1; 
+  int block_num_n = (N - 1) / 16 + 1;
   int blockIdx_m = (int)blockIdx.x / block_num_n;
   int blockIdx_n = (int)blockIdx.x % block_num_n;
   int threadIdx_x = (int)threadIdx.x;
 
   // hoisting shared pointer offsets
-  int * out_in_map_ptr = out_in_map 
-                         + (blockIdx_m * 128 + (threadIdx_x / (16/4)))* kernel_volume;  
+  int * out_in_map_ptr = out_in_map
+                         + (blockIdx_m * 128 + (threadIdx_x / (16/4)))* kernel_volume;
 
-  float * B_ptr = B 
-                  + (threadIdx_x / (16/4)) * N 
-                  + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); 
+  float * B_ptr = B
+                  + (threadIdx_x / (16/4)) * N
+                  + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16);
 
   float * A_shared_ptr = A_shared + (threadIdx_x * 4);
-  float * A_shared_reduce_ptr =  A_shared + ((threadIdx_x / 4) * 16); 
+  float * A_shared_reduce_ptr =  A_shared + ((threadIdx_x / 4) * 16);
   float * B_shared_ptr = B_shared + (threadIdx_x * 4);
   float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 4);
 
@@ -648,7 +648,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32
       }
 
       int* out_in_map_ptr_local = out_in_map_ptr + k_0 * 16 / K_tile_padded;
-      float* A_ptr_local = A  + (k_0 * 16 % K_tile_padded) + channel_offset_A;  
+      float* A_ptr_local = A  + (k_0 * 16 % K_tile_padded) + channel_offset_A;
 
       float* B_ptr_local;
       if constexpr (K_ld_check)
@@ -661,14 +661,14 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32
       for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0)
       {
 
-        int input_idx = *(out_in_map_ptr_local + (ax0_ax1_fused_0 *16) * kernel_volume); 
+        int input_idx = *(out_in_map_ptr_local + (ax0_ax1_fused_0 *16) * kernel_volume);
         if (input_idx != -1)
         {
           uint4 A_loaded = make_uint4(0, 0, 0, 0);
           global_load<K_ld_factor>(A_loaded, A_ptr_local + (input_idx * K_original) , A_pred_guard);
           *(uint4 *)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = A_loaded;
         }
-        else 
+        else
         {
           *(uint4*)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = make_uint4(0, 0, 0, 0);
         }
@@ -678,23 +678,23 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32
       for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 1; ++ax0_ax1_fused_0_1)
       {
         uint4 B_loaded = make_uint4(0, 0, 0, 0);
-        global_load<N_ld_factor>(B_loaded, B_ptr_local + (ax0_ax1_fused_0_1 * 16) * N, B_pred_guard); 
+        global_load<N_ld_factor>(B_loaded, B_ptr_local + (ax0_ax1_fused_0_1 * 16) * N, B_pred_guard);
         *(uint4 *)(B_shared_ptr + (ax0_ax1_fused_0_1 * 256)) = B_loaded;
       }
 
       __syncthreads();
       #pragma unroll
-      for (int k_1 = 0; k_1 < ( 16 / 4); ++k_1) 
+      for (int k_1 = 0; k_1 < ( 16 / 4); ++k_1)
       {
         #pragma unroll
-        for (int k_2 = 0; k_2 < 4; ++k_2) 
+        for (int k_2 = 0; k_2 < 4; ++k_2)
         {
           int vk_in_block = (k_1 << 2) + k_2;
           #pragma unroll
-          for (int i = 0; i < 32; ++i) 
+          for (int i = 0; i < 32; ++i)
           {
-            C_local[i] = C_local[i] + 
-                            A_shared_reduce_ptr[((i / 4) * 16) * 16 + vk_in_block] 
+            C_local[i] = C_local[i] +
+                            A_shared_reduce_ptr[((i / 4) * 16) * 16 + vk_in_block]
                             * B_shared_reduce_ptr[(vk_in_block * 16) + ((i % 4) * 4)];
 
           }
@@ -707,7 +707,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32
   for (int i = 0; i < 32; ++i)
   {
       int location_cur = location_offset + ((i / 4) * 16);
-      int vn = C_n_offset + ((i % 4) * 4); 
+      int vn = C_n_offset + ((i % 4) * 4);
 
       if constexpr (N_ld_check)
       {
@@ -723,34 +723,34 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32
 }
 
 // conv_forward_cuda_m128n16k32_f32f32f32
-__global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) 
+__global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C)
 {
   float C_local[32];
   __shared__ float A_shared[4096];
   __shared__ float B_shared[512];
 
   #pragma unroll
-  for (int i = 0; i < 32; ++i)   
+  for (int i = 0; i < 32; ++i)
   {
     C_local[i] = 0.0;
   }
-  
+
   int K_loops = (K_original * kernel_volume - 1) / 32 + 1;
-  int block_num_n = (N - 1) / 16 + 1; 
+  int block_num_n = (N - 1) / 16 + 1;
   int blockIdx_m = (int)blockIdx.x / block_num_n;
   int blockIdx_n = (int)blockIdx.x % block_num_n;
   int threadIdx_x = (int)threadIdx.x;
 
   // hoisting shared pointer offsets
-  int * out_in_map_ptr = out_in_map 
-                         + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume;  
+  int * out_in_map_ptr = out_in_map
+                         + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume;
 
-  float * B_ptr = B 
-                  + (threadIdx_x / (16/4)) * N 
-                  + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); 
+  float * B_ptr = B
+                  + (threadIdx_x / (16/4)) * N
+                  + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16);
 
   float * A_shared_ptr = A_shared + (threadIdx_x * 4);
-  float * A_shared_reduce_ptr =  A_shared + ((threadIdx_x / 4) * 32); 
+  float * A_shared_reduce_ptr =  A_shared + ((threadIdx_x / 4) * 32);
   float * B_shared_ptr = B_shared + (threadIdx_x * 4);
   float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 4);
 
@@ -762,7 +762,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32
   #pragma unroll
   for (int k_0 = 0; k_0 < K_loops; ++k_0) {
 
-    int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; 
+    int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A;
     int kernel_offset = k_0 / (K_original / 32);
     int *out_in_map_ptr_k = out_in_map_ptr + kernel_offset;
 
@@ -772,8 +772,8 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32
       for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 16; ++ax0_ax1_fused_0)
       {
 
-        int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *8) * kernel_volume); 
-        if (input_idx != -1) 
+        int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *8) * kernel_volume);
+        if (input_idx != -1)
         {
 
           *(float4*)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) =  // ax0_ax1_fused_0 * elements loaded in each loop
@@ -788,27 +788,27 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32
       }
 
       #pragma unroll
-      for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 2; ++ax0_ax1_fused_0_1)    
+      for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 2; ++ax0_ax1_fused_0_1)
       {
 
         *(float4*)(B_shared_ptr + (ax0_ax1_fused_0_1 * 256)) =                 // ax0_ax1_fused_0_1 * elements loaded in each loop
-              *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 16)) * N); 
+              *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 16)) * N);
 
       }
 
       __syncthreads();
       #pragma unroll
-      for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) 
+      for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1)
       {
         #pragma unroll
-        for (int k_2 = 0; k_2 < 4; ++k_2) 
+        for (int k_2 = 0; k_2 < 4; ++k_2)
         {
           int vk_in_block = (k_1 << 2) + k_2;
           #pragma unroll
-          for (int i = 0; i < 32; ++i) 
+          for (int i = 0; i < 32; ++i)
           {
-            C_local[i] = C_local[i] + 
-                            A_shared_reduce_ptr[((i / 4) * 16) * 32 + vk_in_block] 
+            C_local[i] = C_local[i] +
+                            A_shared_reduce_ptr[((i / 4) * 16) * 32 + vk_in_block]
                             * B_shared_reduce_ptr[(vk_in_block * 16) + ((i % 4) * 4)];
 
           }
@@ -818,44 +818,44 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32
   }
 
   #pragma unroll
-  for (int i = 0; i < 32; ++i) 
+  for (int i = 0; i < 32; ++i)
   {
       int location_cur = location_offset + ((i / 4) * 16);
-      int vn = C_n_offset + ((i % 4) * 4); 
+      int vn = C_n_offset + ((i % 4) * 4);
       if (location_cur < M)
         C[location_cur * N + vn] = C_local[i];
    }
 }
 
 // conv_forward_cuda_m128n64k32_f32f32f32
-__global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) 
+__global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C)
 {
   float C_local[64];
   __shared__ float A_shared[4096];
   __shared__ float B_shared[2048];
 
   #pragma unroll
-  for (int i = 0; i < 64; ++i)   
+  for (int i = 0; i < 64; ++i)
   {
     C_local[i] = 0.0;
   }
-  
+
   int K_loops = (K_original * kernel_volume - 1) / 32 + 1;
-  int block_num_n = (N - 1) / 64 + 1; 
+  int block_num_n = (N - 1) / 64 + 1;
   int blockIdx_m = (int)blockIdx.x / block_num_n;
   int blockIdx_n = (int)blockIdx.x % block_num_n;
   int threadIdx_x = (int)threadIdx.x;
 
   // hoisting shared pointer offsets
-  int * out_in_map_ptr = out_in_map 
-                         + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume;  
+  int * out_in_map_ptr = out_in_map
+                         + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume;
 
-  float * B_ptr = B 
-                  + (threadIdx_x / (64/4)) * N 
-                  + (blockIdx_n * 64) + ((threadIdx_x * 4) % 64); 
+  float * B_ptr = B
+                  + (threadIdx_x / (64/4)) * N
+                  + (blockIdx_n * 64) + ((threadIdx_x * 4) % 64);
 
   float * A_shared_ptr = A_shared + (threadIdx_x * 4);
-  float * A_shared_reduce_ptr =  A_shared + ((threadIdx_x / 16) * 32); 
+  float * A_shared_reduce_ptr =  A_shared + ((threadIdx_x / 16) * 32);
   float * B_shared_ptr = B_shared + (threadIdx_x * 4);
   float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 16);
 
@@ -867,7 +867,7 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3
   #pragma unroll
   for (int k_0 = 0; k_0 < K_loops; ++k_0) {
 
-    int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; 
+    int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A;
     int kernel_offset = k_0 / (K_original / 32);
     int *out_in_map_ptr_k = out_in_map_ptr + kernel_offset;
 
@@ -877,8 +877,8 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3
       for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0)
       {
 
-        int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *16) * kernel_volume); 
-        if (input_idx != -1) 
+        int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *16) * kernel_volume);
+        if (input_idx != -1)
         {
 
           *(float4*)(A_shared_ptr + (ax0_ax1_fused_0 * 512)) =  // ax0_ax1_fused_0 * elements loaded in each loop
@@ -893,27 +893,27 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3
       }
 
       #pragma unroll
-      for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1)    
+      for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1)
       {
 
         *(float4*)(B_shared_ptr + (ax0_ax1_fused_0_1 * 512)) =                 // ax0_ax1_fused_0_1 * elements loaded in each loop
-              *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 8)) * N); 
+              *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 8)) * N);
 
       }
 
       __syncthreads();
       #pragma unroll
-      for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) 
+      for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1)
       {
         #pragma unroll
-        for (int k_2 = 0; k_2 < 4; ++k_2) 
+        for (int k_2 = 0; k_2 < 4; ++k_2)
         {
           int vk_in_block = (k_1 << 2) + k_2;
           #pragma unroll
-          for (int i = 0; i < 64; ++i) 
+          for (int i = 0; i < 64; ++i)
           {
-            C_local[i] = C_local[i] + 
-                            A_shared_reduce_ptr[((i / 4) * 8) * 32 + vk_in_block] 
+            C_local[i] = C_local[i] +
+                            A_shared_reduce_ptr[((i / 4) * 8) * 32 + vk_in_block]
                             * B_shared_reduce_ptr[(vk_in_block * 64) + ((i % 4) * 16)];
 
           }
@@ -923,10 +923,10 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3
   }
 
   #pragma unroll
-  for (int i = 0; i < 64; ++i) 
+  for (int i = 0; i < 64; ++i)
   {
       int location_cur = location_offset + ((i / 4) * 8);
-      int vn = C_n_offset + ((i % 4) * 16); 
+      int vn = C_n_offset + ((i % 4) * 16);
       if (location_cur < M)
         C[location_cur * N + vn] = C_local[i];
    }
@@ -944,10 +944,10 @@ void conv_forward_implicit_gemm_cuda(
   auto compute_capability = dev_ctx.GetComputeCapability();
   bool allow_fp16 = compute_capability >= 75;
   bool is_half = _in_feats.dtype() == phi::DataType::FLOAT16;
-  
+
   int num_in_feats = _in_feats.dims()[0];
   int num_in_channels = _in_feats.dims()[1];
-  
+
   int kernel_volume = _out_in_map.dims()[1];
   auto out_in_map = const_cast<int*>(_out_in_map.data<int>());
 
@@ -1141,7 +1141,7 @@ void conv_forward_implicit_gemm_cuda(
     {
       int block_num_M = (num_out_feats + 127) / 128;
       int block_num_N = num_out_channels / 64;  //j_factors1
-      dim3 num_blocks(block_num_M * block_num_N); 
+      dim3 num_blocks(block_num_M * block_num_N);
       dim3 threads_per_block(128);
       conv_forward_cuda_setting3_mode0_f32f32f32<<<num_blocks, threads_per_block, 0, dev_ctx.stream()>>>(
           _out_feats.dims()[0], num_in_channels, num_out_channels, kernel_volume, in_feats, kernel, out_in_map, out_feats);
@@ -1150,7 +1150,7 @@ void conv_forward_implicit_gemm_cuda(
     {
       int block_num_M = (num_out_feats + 127) / 128;
       int block_num_N = num_out_channels / 16;  //j_factors1
-      dim3 num_blocks(block_num_M * block_num_N); 
+      dim3 num_blocks(block_num_M * block_num_N);
       dim3 threads_per_block(64);
       conv_forward_cuda_setting2_mode0_f32f32f32<<<num_blocks, threads_per_block, 0, dev_ctx.stream()>>>(
           _out_feats.dims()[0], num_in_channels, num_out_channels, kernel_volume, in_feats, kernel, out_in_map, out_feats);
@@ -1159,7 +1159,7 @@ void conv_forward_implicit_gemm_cuda(
     {
       int block_num_M = (num_out_feats + 127) / 128;
       int block_num_N = (num_out_channels + 15) / 16;  //j_factors1
-      dim3 num_blocks(block_num_M * block_num_N); 
+      dim3 num_blocks(block_num_M * block_num_N);
       dim3 threads_per_block(64);
 
       if (num_in_channels % 16 == 0)
diff --git a/paddle/phi/kernels/sparse/gpu/mask_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_grad_kernel.cu
new file mode 100644
index 0000000000000..1e4e3276d82e1
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/mask_grad_kernel.cu
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/sparse/mask_grad_kernel.h"
+#include "paddle/phi/kernels/sparse/mask_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(mask_as_coo_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaskAsCooGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {  // tags input 1 as COO
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
+PD_REGISTER_KERNEL(mask_as_csr_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaskAsCsrGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {  // tags input 1 as CSR
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR);
+}
diff --git a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu
index 0941ad69b0dd2..3459f6802b881 100644
--- a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/execution_policy.h>
+
 #include "paddle/phi/kernels/sparse/mask_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
 
 #include "paddle/common/ddim.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
@@ -106,22 +109,256 @@ void MaskCooGPUKernel(const GPUContext& dev_ctx,
   out->SetMember(out_indices, out_values, dims, true);
 }
 
+// Expands CSR row pointers into per-element COO row (and batch) indices.
+template <typename IntT>
+__global__ void ConvertCsrCrowsToCooRows(const IntT* crows_ptr,
+                                         const IntT* crows_offsets,
+                                         IntT* rows_ptr,
+                                         IntT* batch_ptr,
+                                         const int rows) {
+  const int b = blockIdx.y;  // blockIdx.y selects the batch
+  // A null crows_offsets means this batch's elements start at position 0.
+  const int64_t offset = crows_offsets ? crows_offsets[b] : 0;
+  const IntT* batch_crows = crows_ptr + b * (rows + 1);
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < rows; i += gridDim.x * blockDim.x) {
+    // IntT counter: a plain int would truncate 64-bit row pointers.
+    for (IntT j = batch_crows[i]; j < batch_crows[i + 1]; ++j) {
+      rows_ptr[offset + j] = i;
+      if (batch_ptr) batch_ptr[offset + j] = b;
+    }
+  }
+}
+
+// Writes each batch's element count, i.e. the last entry of its crows segment.
+template <typename IntT>
+__global__ void GetBatchSizes(const IntT* crows,
+                              const int rows,
+                              const int batches,
+                              IntT* batch_sizes) {
+  const int batch = blockIdx.x * blockDim.x + threadIdx.x;
+  if (batch >= batches) return;  // surplus threads have no batch to handle
+  batch_sizes[batch] = crows[batch * (rows + 1) + rows];
+}
+
+// Gathers the values of the 2-D dense tensor x at the positions described by
+// the CSR mask. The crows are expanded into COO row indices so MaskKernel can
+// do the gather; out receives copies of the mask's crows and cols.
+template <typename T, typename IntT>
+void MaskCsr2DGPUKernel(const GPUContext& dev_ctx,
+                        const DenseTensor& x,
+                        const SparseCsrTensor& mask,
+                        SparseCsrTensor* out) {
+  const DenseTensor& mask_cols = mask.cols();
+  const DenseTensor& mask_crows = mask.crows();
+  const int64_t non_zero_num = mask.nnz();
+  const auto stream = dev_ctx.stream();
+
+  DenseTensor out_cols = phi::EmptyLike<IntT>(dev_ctx, mask_cols);
+  DenseTensor out_crows = phi::EmptyLike<IntT>(dev_ctx, mask_crows);
+  DenseTensor out_values = phi::Empty<T>(dev_ctx, {non_zero_num});
+
+  phi::Copy(dev_ctx, mask_cols, dev_ctx.GetPlace(), false, &out_cols);
+  phi::Copy(dev_ctx, mask_crows, dev_ctx.GetPlace(), false, &out_crows);
+
+  const DDim& dims = x.dims();
+  int64_t sparse_dim = 2;
+  // Offsets are stored as int64_t, so the buffer must not be sized by IntT.
+  DenseTensor sparse_offsets = phi::Empty<int64_t>(dev_ctx, {sparse_dim});
+  std::vector<int64_t> h_sparse_offsets(sparse_dim);
+  phi::funcs::sparse::CalcOffsetsPerDim(
+      dims, sparse_dim, h_sparse_offsets.data());
+
+  phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data<int64_t>(),
+                                     &h_sparse_offsets[0],
+                                     sizeof(int64_t) * sparse_dim,
+                                     gpuMemcpyHostToDevice,
+                                     stream);
+
+  const IntT* csr_crows_data = mask_crows.data<IntT>();
+  const IntT* csr_cols_data = mask_cols.data<IntT>();
+
+  const int batches = 1;
+  const int rows = dims[0];
+  auto dims_2d = flatten_to_2d(dims, sparse_dim);
+  const int cols = dims_2d[1];
+
+  // COO index buffer: non_zero_num row indices followed by the col indices.
+  DenseTensor indices = phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
+  IntT* coo_indices = indices.data<IntT>();
+  IntT* batch_ptr = nullptr;  // 2-D input: no batch indices are written
+  IntT* coo_rows_data = coo_indices;
+  IntT* coo_cols_data = coo_rows_data + non_zero_num;
+  IntT* offsets_ptr = nullptr;  // single batch, so elements start at 0
+
+  // Launch on the context stream to stay ordered with the surrounding work.
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
+  config.block_per_grid.y = batches;
+  ConvertCsrCrowsToCooRows<IntT>
+      <<<config.block_per_grid, config.thread_per_block.x, 0, stream>>>(
+          csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);
+  phi::backends::gpu::GpuMemcpyAsync(coo_cols_data,
+                                     csr_cols_data,
+                                     sizeof(IntT) * non_zero_num,
+                                     gpuMemcpyDeviceToDevice,
+                                     stream);
+
+  auto config_mask =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1);
+  MaskKernel<T, IntT><<<config_mask.block_per_grid,
+                        config_mask.thread_per_block,
+                        0,
+                        stream>>>(x.data<T>(),
+                                  coo_indices,
+                                  sparse_offsets.data<int64_t>(),
+                                  non_zero_num,
+                                  cols,
+                                  sparse_dim,
+                                  out_values.data<T>());
+
+  out->SetMember(out_crows, out_cols, out_values, x.dims());
+}
+
+// 3-D (batched) variant: gathers x at the CSR mask's positions. Per-batch
+// start offsets are computed first so each batch's crows can be expanded into
+// COO batch/row indices for MaskKernel.
+template <typename T, typename IntT>
+void MaskCsr3DGPUKernel(const GPUContext& dev_ctx,
+                        const DenseTensor& x,
+                        const SparseCsrTensor& mask,
+                        SparseCsrTensor* out) {
+  const DenseTensor& mask_cols = mask.cols();
+  const DenseTensor& mask_crows = mask.crows();
+  const int64_t non_zero_num = mask.nnz();
+  const auto stream = dev_ctx.stream();
+
+  DenseTensor out_cols = phi::EmptyLike<IntT>(dev_ctx, mask_cols);
+  DenseTensor out_crows = phi::EmptyLike<IntT>(dev_ctx, mask_crows);
+  DenseTensor out_values = phi::Empty<T>(dev_ctx, {non_zero_num});
+
+  phi::Copy(dev_ctx, mask_cols, dev_ctx.GetPlace(), false, &out_cols);
+  phi::Copy(dev_ctx, mask_crows, dev_ctx.GetPlace(), false, &out_crows);
+
+  const DDim& dims = x.dims();
+  int64_t sparse_dim = 3;
+  // Offsets are stored as int64_t, so the buffer must not be sized by IntT.
+  DenseTensor sparse_offsets = phi::Empty<int64_t>(dev_ctx, {sparse_dim});
+  std::vector<int64_t> h_sparse_offsets(sparse_dim);
+  phi::funcs::sparse::CalcOffsetsPerDim(
+      dims, sparse_dim, h_sparse_offsets.data());
+
+  phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data<int64_t>(),
+                                     &h_sparse_offsets[0],
+                                     sizeof(int64_t) * sparse_dim,
+                                     gpuMemcpyHostToDevice,
+                                     stream);
+
+  const IntT* csr_crows_data = mask_crows.data<IntT>();
+  const IntT* csr_cols_data = mask_cols.data<IntT>();
+
+  const int batches = dims[0];
+  const int rows = dims[1];
+  auto dims_2d = flatten_to_2d(dims, sparse_dim);
+  const int cols = dims_2d[1];
+
+  // COO index buffer: batch ids, then row indices, then col indices.
+  DenseTensor indices = phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
+  DenseTensor offsets = phi::Empty<IntT>(dev_ctx, {batches});
+  IntT* coo_indices = indices.data<IntT>();
+  IntT* batch_ptr = coo_indices;
+  IntT* coo_rows_data = batch_ptr + non_zero_num;
+  IntT* coo_cols_data = coo_rows_data + non_zero_num;
+  IntT* offsets_ptr = offsets.data<IntT>();
+
+  // Per-batch nnz, turned into start offsets by the exclusive scan below.
+  auto config_batch =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1);
+  GetBatchSizes<IntT><<<config_batch.block_per_grid.x,
+                        config_batch.thread_per_block.x,
+                        0,
+                        stream>>>(csr_crows_data, rows, batches, offsets_ptr);
+#ifdef PADDLE_WITH_HIP
+  thrust::exclusive_scan(thrust::hip::par.on(stream),
+#else
+  thrust::exclusive_scan(thrust::cuda::par.on(stream),
+#endif
+                         offsets_ptr,
+                         offsets_ptr + batches,
+                         offsets_ptr);
+
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
+  config.block_per_grid.y = batches;
+  ConvertCsrCrowsToCooRows<IntT>
+      <<<config.block_per_grid, config.thread_per_block.x, 0, stream>>>(
+          csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);
+  phi::backends::gpu::GpuMemcpyAsync(coo_cols_data,
+                                     csr_cols_data,
+                                     sizeof(IntT) * non_zero_num,
+                                     gpuMemcpyDeviceToDevice,
+                                     stream);
+
+  auto config_mask =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1);
+  MaskKernel<T, IntT><<<config_mask.block_per_grid,
+                        config_mask.thread_per_block,
+                        0,
+                        stream>>>(x.data<T>(),
+                                  coo_indices,
+                                  sparse_offsets.data<int64_t>(),
+                                  non_zero_num,
+                                  cols,
+                                  sparse_dim,
+                                  out_values.data<T>());
+
+  out->SetMember(out_crows, out_cols, out_values, x.dims());
+}
+
 /**
  * @brief Filter the DenseTensor x by the
  * mask.indices() and output a SparseCooTensor
  * x and mask must have the same shape.
  **/
 template <typename T, typename Context>
-void MaskCooKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const SparseCooTensor& mask,
-                   SparseCooTensor* out) {
+void MaskAsCooKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const SparseCooTensor& mask,
+                     SparseCooTensor* out) {
   PD_VISIT_BASE_INTEGRAL_TYPES(
       mask.indices().dtype(), "MaskCooGPUKernel", ([&] {
         MaskCooGPUKernel<T, data_t>(dev_ctx, x, mask, out);
       }));
 }
 
+/**
+ * @brief Gather the values of the DenseTensor x at the positions given by
+ * mask.crows() / mask.cols() and return them as a SparseCsrTensor.
+ * x must be 2-D or 3-D; any other rank raises InvalidArgument.
+ **/
+template <typename T, typename Context>
+void MaskAsCsrKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const SparseCsrTensor& mask,
+                     SparseCsrTensor* out) {
+  const phi::DDim& x_dims = x.dims();
+  if (x_dims.size() == 2) {
+    PD_VISIT_BASE_INTEGRAL_TYPES(
+        mask.crows().dtype(), "MaskCsr2DGPUKernel", ([&] {
+          MaskCsr2DGPUKernel<T, data_t>(dev_ctx, x, mask, out);
+        }));
+  } else if (x_dims.size() == 3) {
+    PD_VISIT_BASE_INTEGRAL_TYPES(
+        mask.crows().dtype(), "MaskCsr3DGPUKernel", ([&] {
+          MaskCsr3DGPUKernel<T, data_t>(dev_ctx, x, mask, out);
+        }));
+  } else {
+    // Building the error object alone does nothing; it has to be thrown.
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "mask_as for Sparse CSR Tensor only support 2-D or 3-D, but got "
+        "%d-D.",
+        x_dims.size()));
+  }
+}
+
 template <typename IntT>
 __global__ void MaskTable(const IntT* x_indexs,
                           const int n,
@@ -296,10 +533,26 @@ void MaskHelperCooKernel(const Context& dev_ctx,
 }  // namespace sparse
 }  // namespace phi
 
-PD_REGISTER_KERNEL(mask_coo,
+PD_REGISTER_KERNEL(mask_helper_coo,
                    GPU,
                    ALL_LAYOUT,
-                   phi::sparse::MaskCooKernel,
+                   phi::sparse::MaskHelperCooKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
+PD_REGISTER_KERNEL(mask_as_coo,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaskAsCooKernel,
                    float,
                    double,
                    phi::dtype::float16,
@@ -314,18 +567,20 @@ PD_REGISTER_KERNEL(mask_coo,
   kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
 }
 
-PD_REGISTER_KERNEL(mask_helper_coo,
+PD_REGISTER_KERNEL(mask_as_csr,
                    GPU,
                    ALL_LAYOUT,
-                   phi::sparse::MaskHelperCooKernel,
+                   phi::sparse::MaskAsCsrKernel,
                    float,
                    double,
                    phi::dtype::float16,
                    uint8_t,
+                   int8_t,
                    int16_t,
                    int,
                    int64_t,
+                   bool,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {
-  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+  kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_CSR);
 }
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh b/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh
index 73ad53de502da..380abb419b40a 100644
--- a/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh
+++ b/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh
@@ -65,7 +65,7 @@ class GPUHashTable {
   key_type* table_keys;
   val_type* table_vals;
   void insert_many_coords(const phi::GPUContext& dev_ctx, const int *coords, const int n);
-  void lookup_many_coords(const phi::GPUContext& dev_ctx, const int *coords, val_type *results, 
+  void lookup_many_coords(const phi::GPUContext& dev_ctx, const int *coords, val_type *results,
     const int* kernel_sizes, const int* tensor_strides,
     const int n, const int kernel_volume);
  public:
@@ -112,8 +112,8 @@ __global__ void insert_coords_kernel(key_type* table_keys, val_type* table_vals,
 
 template <typename key_type=int64_t, typename val_type=int, bool odd>
 __global__ void lookup_coords_kernel(
-  key_type* table_keys, val_type* table_vals, const int* coords, val_type* vals, 
-  const int* kernel_sizes, const int* strides, 
+  key_type* table_keys, val_type* table_vals, const int* coords, val_type* vals,
+  const int* kernel_sizes, const int* strides,
   int n, int _capacity, int kernel_volume, int _width)
 {
     int tidx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -125,8 +125,8 @@ __global__ void lookup_coords_kernel(
     //coords_out[2] = in_coords[2];
     //coords_out[3] = in_coords[3];
     coords_out[0] = in_coords[0];
-    
-    if constexpr (odd) 
+
+    if constexpr (odd)
     {
       #pragma unroll
       for(int i = 0; i <= _width-2; i++){
@@ -146,7 +146,7 @@ __global__ void lookup_coords_kernel(
         _kernel_idx /= kernel_sizes[i];
       }
     }
-    
+
     if (idx < n)
     {
         key_type key = (key_type)(hash_func_64b(coords_out, _width));
@@ -156,7 +156,7 @@ __global__ void lookup_coords_kernel(
         {
             key_type cur_key = table_keys[slot];
             if (key == cur_key)
-            { 
+            {
                 vals[idx * kernel_volume + kernel_idx] = table_vals[slot] - 1; // need to subtract 1 to avoid extra operations in python
             }
             if (table_keys[slot] == EMPTY_CELL)
@@ -181,7 +181,7 @@ void GPUHashTable<key_type, val_type>::insert_coords(const phi::GPUContext& dev_
 template <typename key_type, typename val_type>
 void GPUHashTable<key_type, val_type>::lookup_many_coords(
   const phi::GPUContext& dev_ctx,
-  const int* coords, val_type* results, 
+  const int* coords, val_type* results,
   const int* kernel_sizes, const int* strides,
   const int n, const int kernel_volume){
   if (kernel_volume % 2)
diff --git a/paddle/phi/kernels/sparse/gpu/sum_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sum_grad_kernel.cu
index b0da1e7ab42f0..dc82d427c53c8 100644
--- a/paddle/phi/kernels/sparse/gpu/sum_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sum_grad_kernel.cu
@@ -54,7 +54,7 @@ __global__ void SumCsr3DGradCudaKernel(const int64_t* x_crows_data,
                                        const int64_t x_dim1,
                                        T* dx_values_data) {
   // dout_crows_data[index] should be equal to number;
-  CUDA_KERNEL_LOOP_TYPE(index, x_dim0 * (x_dim1 + 1), int64_t) {
+  CUDA_KERNEL_LOOP_TYPE(index, x_dim0 * (x_dim1 + 1) - 1, int64_t) {
     int64_t batch = index / (x_dim1 + 1);
     int64_t number = index % (x_dim1 + 1);
 
diff --git a/paddle/phi/kernels/sparse/gpu/sum_kernel.cu b/paddle/phi/kernels/sparse/gpu/sum_kernel.cu
index c9efc79e29b6c..29fc3a1d9b327 100644
--- a/paddle/phi/kernels/sparse/gpu/sum_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/sum_kernel.cu
@@ -137,11 +137,16 @@ __global__ void SumCsr3DCudaKernel(const int64_t* x_crows_data,
                                    int64_t* out_crows_data,
                                    int64_t* out_cols_data,
                                    T* out_values_data) {
+  {
+    CUDA_KERNEL_LOOP_TYPE(index, x_dim0 * x_dim1, int64_t) {
+      out_cols_data[index] = 0;
+    }
+  }
+
   CUDA_KERNEL_LOOP_TYPE(index, x_dim0 * (x_dim1 + 1), int64_t) {
     int64_t batch = index / (x_dim1 + 1);
     int64_t number = index % (x_dim1 + 1);
     out_crows_data[index] = number;
-    out_cols_data[index] = 0;
 
     if (number != x_dim1) {
       T sum_value = 0;
@@ -154,6 +159,8 @@ __global__ void SumCsr3DCudaKernel(const int64_t* x_crows_data,
       for (int64_t j = x_crows_data[index]; j < x_crows_data[index + 1]; ++j) {
         sum_value += x_values_data[j + x_values_data_offset];
       }
+
+      // `index - batch` would never exceed x_dim0 * x_dim1.
       out_values_data[index - batch] = sum_value;
     }
   }
diff --git a/paddle/phi/kernels/sparse/mask_grad_kernel.h b/paddle/phi/kernels/sparse/mask_grad_kernel.h
new file mode 100644
index 0000000000000..687562aa300d1
--- /dev/null
+++ b/paddle/phi/kernels/sparse/mask_grad_kernel.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/core/sparse_csr_tensor.h"
+#include "paddle/phi/kernels/sparse/mask_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T, typename Context>  // grad of mask_as for COO masks
+void MaskAsCooGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,         // not read here
+                         const SparseCooTensor& mask,  // not read here
+                         const SparseCooTensor& out_grad,
+                         DenseTensor* x_grad) {
+  CooToDenseKernel<T, Context>(dev_ctx, out_grad, x_grad);  // densify out_grad
+}
+
+template <typename T, typename Context>  // grad of mask_as for CSR masks
+void MaskAsCsrGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,         // not read here
+                         const SparseCsrTensor& mask,  // not read here
+                         const SparseCsrTensor& out_grad,
+                         DenseTensor* x_grad) {
+  CsrToDenseKernel<T, Context>(dev_ctx, out_grad, x_grad);  // densify out_grad
+}
+
+}  // namespace sparse
+}  // namespace phi
diff --git a/paddle/phi/kernels/sparse/mask_kernel.h b/paddle/phi/kernels/sparse/mask_kernel.h
index 5ffc7fb4aa44d..5be993e243b19 100644
--- a/paddle/phi/kernels/sparse/mask_kernel.h
+++ b/paddle/phi/kernels/sparse/mask_kernel.h
@@ -16,21 +16,28 @@ limitations under the License. */
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/core/sparse_csr_tensor.h"
 
 namespace phi {
 namespace sparse {
 
-template <typename T, typename Context>
-void MaskCooKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const SparseCooTensor& mask,
-                   SparseCooTensor* out);
-
 template <typename T, typename Context>
 void MaskHelperCooKernel(const Context& dev_ctx,
                          const SparseCooTensor& x,
                          const DenseTensor& mask_indices,
                          DenseTensor* out);
 
+template <typename T, typename Context>  // filters dense x by a COO mask
+void MaskAsCooKernel(const Context& dev_ctx,
+                     const DenseTensor& x,         // dense source tensor
+                     const SparseCooTensor& mask,  // supplies the indices
+                     SparseCooTensor* out);        // filtered sparse result
+
+template <typename T, typename Context>  // filters dense x by a CSR mask
+void MaskAsCsrKernel(const Context& dev_ctx,
+                     const DenseTensor& x,         // dense source tensor
+                     const SparseCsrTensor& mask,  // supplies crows and cols
+                     SparseCsrTensor* out);        // filtered sparse result
+
 }  // namespace sparse
 }  // namespace phi
diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
index f5915c7acb84c..4933aac3c23ec 100644
--- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace sparse {
+namespace phi::sparse {
 
 template <typename T, typename Context>
 void ValuesCooGradKernel(const Context& dev_ctx UNUSED,
@@ -32,11 +31,10 @@ void CooToDenseGradKernel(const Context& dev_ctx,
                           const SparseCooTensor& x,
                           const DenseTensor& out_grad,
                           SparseCooTensor* x_grad) {
-  MaskCooKernel<T, Context>(dev_ctx, out_grad, x, x_grad);
+  MaskAsCooKernel<T, Context>(dev_ctx, out_grad, x, x_grad);
 }
 
-}  // namespace sparse
-}  // namespace phi
+}  // namespace phi::sparse
 
 PD_REGISTER_KERNEL(values_coo_grad,
                    CPU,
diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc
index b5efcd49166fd..f4ff64b5cd2af 100644
--- a/paddle/phi/kernels/stride/slice_kernel.cc
+++ b/paddle/phi/kernels/stride/slice_kernel.cc
@@ -33,7 +33,7 @@ void SliceStridedKernel(const Context& ctx,
                         DenseTensor* out) {
   std::vector<int64_t> starts = starts_arr.GetData();
   std::vector<int64_t> ends = ends_arr.GetData();
-  auto in_dims = input.dims();
+  const auto& in_dims = input.dims();
 
   auto new_axes = axes;
   for (auto& item : new_axes) {
diff --git a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc
index da8fba85accf9..06bbe8c15903a 100644
--- a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc
+++ b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "paddle/phi/common/pstring.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi {
-namespace strings {
+namespace phi::strings {
 
 template <typename Context>
 void Copy(const Context& dev_ctx,
@@ -50,8 +49,7 @@ void Copy(const Context& dev_ctx,
   }
 }
 
-}  // namespace strings
-}  // namespace phi
+}  // namespace phi::strings
 
 PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_copy,
                                  CPU,
diff --git a/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc
index b470f3b211f6a..ec3b2b731d7e6 100644
--- a/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc
+++ b/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc
@@ -17,8 +17,7 @@ limitations under the License. */
 
 using pstring = ::phi::dtype::pstring;
 
-namespace phi {
-namespace strings {
+namespace phi::strings {
 
 template <typename ContextT>
 void StringLowerKernel(const ContextT& dev_ctx,
@@ -40,8 +39,7 @@ void StringUpperKernel(const ContextT& dev_ctx,
                           ContextT>()(dev_ctx, x, use_utf8_encoding, out);
 }
 
-}  // namespace strings
-}  // namespace phi
+}  // namespace phi::strings
 
 PD_REGISTER_KERNEL_FOR_ALL_DTYPE(
     strings_lower,
diff --git a/paddle/phi/kernels/xpu/plugin/build.sh b/paddle/phi/kernels/xpu/plugin/build.sh
index 65228c101d354..3b57efba50f38 100755
--- a/paddle/phi/kernels/xpu/plugin/build.sh
+++ b/paddle/phi/kernels/xpu/plugin/build.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/phi/kernels/xpu/plugin/example/build.sh b/paddle/phi/kernels/xpu/plugin/example/build.sh
index d96636707d15a..a54277c769540 100755
--- a/paddle/phi/kernels/xpu/plugin/example/build.sh
+++ b/paddle/phi/kernels/xpu/plugin/example/build.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/phi/kernels/xpu/plugin/example/run.sh b/paddle/phi/kernels/xpu/plugin/example/run.sh
index 25b4a9dbd244e..ae41223f79bcb 100755
--- a/paddle/phi/kernels/xpu/plugin/example/run.sh
+++ b/paddle/phi/kernels/xpu/plugin/example/run.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/phi/kernels/xpu/rms_norm_kernel.cc b/paddle/phi/kernels/xpu/rms_norm_kernel.cc
index 698b2b195da82..85a4ea7291a14 100644
--- a/paddle/phi/kernels/xpu/rms_norm_kernel.cc
+++ b/paddle/phi/kernels/xpu/rms_norm_kernel.cc
@@ -63,10 +63,10 @@ void RmsNormKernel(const Context& dev_ctx,
   const T* norm_weight_data = norm_weight.data<T>();
   const T* norm_bias_data = norm_bias ? norm_bias.get().data<T>() : nullptr;
   // float* inv_var_data = nullptr;
-  if (inv_var != nullptr) {
-    // inv_var_data = dev_ctx.template Alloc<float>(inv_var);
-    PD_THROW("rms_norm in XPU kernel does not support inv_var output");
-  }
+  // if (inv_var != nullptr) {
+  // inv_var_data = dev_ctx.template Alloc<float>(inv_var);
+  // PD_THROW("rms_norm in XPU kernel does not support inv_var output");
+  // }
 
   int32_t rows = 1;
   int32_t cols = 1;
diff --git a/paddle/phi/kernels/xpu/swiglu_kernel.cc b/paddle/phi/kernels/xpu/swiglu_kernel.cc
index a7815931fa6a8..9ba9c10ea1a43 100644
--- a/paddle/phi/kernels/xpu/swiglu_kernel.cc
+++ b/paddle/phi/kernels/xpu/swiglu_kernel.cc
@@ -50,7 +50,7 @@ void SwiGluKernel(const Context& ctx,
                         reinterpret_cast<XPUType*>(z_data),
                         dims_vec,
                         axis,
-                        false,
+                        true,
                         const_nullptr,
                         nullptr,
                         y_ptr);
diff --git a/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc b/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc
index 994699a9fa63a..290081a48f36d 100644
--- a/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc
+++ b/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc
@@ -64,7 +64,7 @@ void SwiGluGradKernel(const Context& ctx,
                              reinterpret_cast<XPUType*>(dx_data),
                              dims_vec,
                              axis,
-                             false,
+                             true,
                              y_ptr,
                              dy_ptr);
   PADDLE_ENFORCE_XDNN_SUCCESS(ret, "swiglu_grad");
diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc
index 5e665711efc8d..6b8dbf641f803 100644
--- a/paddle/phi/kernels/xpu/tile_kernel.cc
+++ b/paddle/phi/kernels/xpu/tile_kernel.cc
@@ -143,4 +143,5 @@ PD_REGISTER_KERNEL(tile,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml
index 934e55ad90a92..69a737eebaaa8 100644
--- a/paddle/phi/ops/yaml/backward.yaml
+++ b/paddle/phi/ops/yaml/backward.yaml
@@ -1067,6 +1067,7 @@
   infer_meta :
     func :  KernelWithXShapeInferMeta
     param : [xshape, out_grad]
+    spmd_rule : FlattenGradInferSpmd
   kernel :
     func : flatten_grad
     data_type : out_grad
@@ -1825,6 +1826,33 @@
   kernel :
     func : logsumexp_grad
 
+- backward_op : lp_pool2d_grad
+  forward : lp_pool2d(Tensor x, IntArray kernel_size, int[] strides = {1,1}, int[] paddings = {0,0}, bool ceil_mode = false, bool exclusive = true, str data_format = "NCHW", str pooling_type = "", bool global_pooling = false, bool adaptive = false, str padding_algorithm = "EXPLICIT", float norm_type = 0.0f) -> Tensor(out)
+  args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, float norm_type)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param: [x]
+  kernel :
+    func : lp_pool2d_grad
+    param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm, norm_type]
+
+- backward_op : lstm_grad
+  forward: lstm (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, bool use_peepholes
+    = true, bool is_reverse = false, bool is_test = false, str gate_activation = "sigmoid",
+    str cell_activation = "tanh", str candidate_activation = "tanh") -> Tensor (hidden), Tensor (cell), Tensor (batch_gate), Tensor (batch_cell_pre_act)
+  args: (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, Tensor hidden, Tensor cell,
+    Tensor batch_gate, Tensor batch_cell_pre_act, Tensor hidden_grad, bool use_peepholes, bool is_reverse, bool is_test, str gate_activation,
+    str cell_activation, str candidate_activation)
+  output: Tensor(input_grad), Tensor(h0_grad), Tensor(c0_grad), Tensor(weight_grad), Tensor(bias_grad)
+  infer_meta:
+    func: LSTMGradInferMeta
+    param: [input, h0, c0, weight, bias]
+  kernel:
+    func: lstm_grad
+    data_type: input
+  optional: h0, c0
+
 - backward_op : lu_grad
   forward : lu (Tensor x, bool pivot = true) -> Tensor(out), Tensor(pivots), Tensor(infos)
   args : (Tensor x, Tensor out, Tensor pivots, Tensor out_grad, bool pivot)
@@ -3193,8 +3221,8 @@
     func : tensor_unfold_grad
 
 - backward_op : thresholded_relu_grad
-  forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, float threshold)
+  forward : thresholded_relu (Tensor x, float threshold, float value) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, float threshold, float value)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
@@ -3490,6 +3518,16 @@
     func: pyramid_hash_grad
     data_type: w
 
+- backward_op: shuffle_batch_grad
+  forward: shuffle_batch (Tensor x, Tensor seed, int startup_seed=0) -> Tensor(out), Tensor(shuffle_idx), Tensor(seed_out)
+  args: (Tensor shuffle_idx, Tensor out_grad, int startup_seed=0)
+  output: Tensor(x_grad)
+  infer_meta:
+    func: ShuffleBatchGradInferMeta
+  kernel:
+    func: shuffle_batch_grad
+    data_type: out_grad  # x is not an argument of this grad op, so the kernel dtype is taken from out_grad
+
 - backward_op: silu_double_grad
   forward: silu_grad (Tensor x, Tensor out, Tensor grad_out) -> Tensor(grad_x)
   args: (Tensor x, Tensor out, Tensor grad_out, Tensor grad_x_grad)
diff --git a/paddle/phi/ops/yaml/fused_backward.yaml b/paddle/phi/ops/yaml/fused_backward.yaml
index 235864c4c9d8b..3bd2673fab016 100644
--- a/paddle/phi/ops/yaml/fused_backward.yaml
+++ b/paddle/phi/ops/yaml/fused_backward.yaml
@@ -40,6 +40,29 @@
     data_type : out_grad
   support_dygraph_mode : true
 
+- backward_op : fused_elemwise_activation_grad  # backward entry for fused_elemwise_activation
+  forward : fused_elemwise_activation (Tensor x, Tensor y, str[] functor_list, int axis = -1, float scale = 0.0, bool save_intermediate_out = false) ->
+    Tensor (out), Tensor (intermediate_out)
+  args : (Tensor x, Tensor y, Tensor out, Tensor intermediate_out, Tensor out_grad, str[] functor_list, int axis = -1, float scale = 0.0, bool save_intermediate_out = false)
+  output : Tensor (x_grad), Tensor (y_grad)
+  infer_meta :
+    func : FusedElemwiseActivationGradInferMeta
+  kernel :
+    func : fused_elemwise_activation_grad
+    data_type : out_grad
+
+- backward_op : fused_elemwise_add_activation_grad
+  forward : fused_elemwise_add_activation (Tensor x, Tensor y, str[] functor_list, int axis = -1, float scale = 0.0, bool save_intermediate_out = false) -> Tensor (out), Tensor (intermediate_out)
+  args : (Tensor x, Tensor y, Tensor out, Tensor intermediate_out, Tensor out_grad, str[] functor_list, int axis = -1, float scale = 0.0, bool save_intermediate_out = false)
+  output : Tensor (x_grad), Tensor (y_grad)
+  infer_meta :
+    func : FusedElemwiseActivationGradInferMeta  # same infer_meta as fused_elemwise_activation_grad
+  kernel :
+    func : fused_elemwise_add_activation_grad
+    data_type : out_grad
+  no_need_buffer : x, y
+  optional : x, intermediate_out
+
 - backward_op : fused_rotary_position_embedding_grad
   forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style, bool time_major, float rotary_emb_base) -> Tensor(out_q), Tensor(out_k), Tensor(out_v)
   args : (Tensor sin, Tensor cos, Tensor position_ids, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad, bool use_neox_rotary_style, bool time_major, float rotary_emb_base)
diff --git a/paddle/phi/ops/yaml/fused_ops.yaml b/paddle/phi/ops/yaml/fused_ops.yaml
index 5db39e9d207d7..3c244b6f4625d 100644
--- a/paddle/phi/ops/yaml/fused_ops.yaml
+++ b/paddle/phi/ops/yaml/fused_ops.yaml
@@ -56,6 +56,20 @@
   data_transform :
     skip_transform : max_enc_len_this_time, max_dec_len_this_time
 
+- op : block_multihead_attention_xpu
+  args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor cache_k_per_batch_maxs, Tensor cache_v_per_batch_maxs, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask,  Tensor tgt_mask, Tensor cache_k_quant_scales, Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype = "default")
+  output : Tensor(fmha_out), Tensor(qkv_out), Tensor(key_cache_out), Tensor(value_cache_out)
+  infer_meta :
+    func : BlockMultiheadAttentionInferXPUMeta
+  kernel :
+    func : block_multihead_attention_xpu
+    data_type : qkv  # kernel dtype follows qkv
+  optional : pre_key_cache, pre_value_cache, rope_emb, mask, tgt_mask, cache_k_quant_scales, cache_v_quant_scales, cache_k_dequant_scales, cache_v_dequant_scales, qkv_out_scale, qkv_bias, out_shift, out_smooth, max_enc_len_this_time, max_dec_len_this_time
+  inplace : (qkv -> qkv_out), (key_cache -> key_cache_out), (value_cache -> value_cache_out)  # three of the four outputs are paired with an input; fmha_out is not
+  support_dygraph_mode : true
+  data_transform :
+    skip_transform : max_enc_len_this_time, max_dec_len_this_time  # both are also listed as optional above
+
 - op : bn_act_xpu
   args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, float momentum, float epsilon, str data_format, int act_type)
   output : Tensor(out)
@@ -307,6 +321,28 @@
     data_type : x
   support_dygraph_mode : true
 
+- op : fused_elemwise_activation
+  args : (Tensor x, Tensor y, str[] functor_list, int axis = -1,
+    float scale = 0.0, bool save_intermediate_out = false)
+  output : Tensor (out), Tensor (intermediate_out)
+  infer_meta :
+    func : FusedElemwiseActivationInferMeta
+  kernel :
+    func : fused_elemwise_activation
+    data_type : x
+  intermediate : intermediate_out
+  backward : fused_elemwise_activation_grad
+
+- op : fused_elemwise_add_activation
+  args : (Tensor x, Tensor y, str[] functor_list, int axis = -1, float scale = 0.0, bool save_intermediate_out = false)
+  output : Tensor(out), Tensor(intermediate_out)
+  infer_meta :
+    func : FusedElemwiseActivationInferMeta  # same infer_meta as fused_elemwise_activation
+  kernel :
+    func : fused_elemwise_add_activation  # NOTE(review): no data_type key here, unlike fused_elemwise_activation - confirm this is intended
+  intermediate : intermediate_out
+  backward : fused_elemwise_add_activation_grad
+
 - op : fused_embedding_eltwise_layernorm
   args : (Tensor[] ids, Tensor[] embs, Tensor bias, Tensor scale, float epsilon = 0.00001f)
   output : Tensor(out)
@@ -400,6 +436,16 @@
     func: fused_token_prune
   support_dygraph_mode : true
 
+- op : fusion_group
+  args : (Tensor[] inputs, int[] outs_dtype = {}, int[] inputs_dtype = {},
+    str func_name = "", int type = 0)
+  output : Tensor[] (outs){inputs.size()}
+  infer_meta :
+    func : FusionGroupInferMeta
+  kernel :
+    func : fusion_group
+    data_type : DataType::FLOAT32  # fixed dtype constant rather than the name of an input
+
 - op : fusion_gru
   args : (Tensor x, Tensor h0, Tensor weight_x, Tensor weight_h, Tensor bias, str activation = "tanh", str gate_activation = "sigmoid", bool is_reverse = false, bool use_seq = true, bool origin_mode = false, bool force_fp32_output = false)
   output : Tensor(reordered_h0), Tensor(xx), Tensor(batched_input), Tensor(batched_out), Tensor(hidden)
@@ -411,6 +457,17 @@
   optional : h0, bias
   intermediate : reordered_h0, xx, batched_input, batched_out
 
+- op : fusion_lstm
+  args : (Tensor x, Tensor weight_x, Tensor weight_h, Tensor bias, Tensor h0, Tensor c0, bool use_peepholes=true, bool is_reverse=false, bool use_seq=true, str gate_activation="sigmoid", str cell_activation="tanh", str candidate_activation="tanh", float scale_data=1.0, float shift_data=0.0, float[] scale_weights={1.0}, bool force_fp32_output=false)
+  output : Tensor(hidden), Tensor(cell), Tensor(xx), Tensor(batched_input), Tensor(batched_hidden), Tensor(batched_cell), Tensor(reordered_h0), Tensor(reordered_c0), Tensor(checked_cell)
+  infer_meta :
+    func : FusionLstmInferMeta
+  kernel :
+    func : fusion_lstm
+    data_type : x  # kernel dtype follows x
+  optional : h0, c0  # h0 and c0 may be left unset
+  intermediate : xx, batched_input, batched_hidden, batched_cell, reordered_h0, reordered_c0, checked_cell  # every output except hidden and cell
+
 - op : fusion_repeated_fc_relu
   args : (Tensor x, Tensor[] w, Tensor[] bias)
   output : Tensor[](relu_out){w.size()-1}, Tensor(out)
@@ -685,3 +742,15 @@
     func : yolo_box_xpu
     data_type : x
   optional : x_max
+
+- op : add_group_norm_silu
+  args : (Tensor x, Tensor residual, Tensor scale, Tensor bias, float epsilon = 1e-5, int groups = -1, str data_format = "NCHW", str activation = "")
+  output : Tensor(y), Tensor(residual_out), Tensor(mean), Tensor(variance)
+  infer_meta :
+    func : AddGroupNormSiluInferMeta
+  kernel :
+    func : add_group_norm_silu
+    data_type : x
+  optional : scale, bias, residual, residual_out  # residual_out is an output; the other three are inputs
+  support_dygraph_mode : true
+  interfaces : paddle::dialect::LayoutTransformationInterface
diff --git a/paddle/phi/ops/yaml/inconsistent/onednn_static.yaml b/paddle/phi/ops/yaml/inconsistent/onednn_static.yaml
index 282dd35cb3453..386eadf0c1dc6 100644
--- a/paddle/phi/ops/yaml/inconsistent/onednn_static.yaml
+++ b/paddle/phi/ops/yaml/inconsistent/onednn_static.yaml
@@ -91,17 +91,6 @@
   kernel :
     func : fused_transpose
 
-- op : fusion_lstm
-  args : (Tensor x, Tensor weight_x, Tensor weight_h, Tensor bias, Tensor h0, Tensor c0, bool use_peepholes=true, bool is_reverse=false, bool use_seq=true, str gate_activation="sigmoid", str cell_activation="tanh", str candidate_activation="tanh", float scale_data=1.0, float shift_data=0.0, float[] scale_weights={1.0}, bool force_fp32_output=false)
-  output : Tensor(hidden), Tensor(cell), Tensor(xx), Tensor(batched_input), Tensor(batched_hidden), Tensor(batched_cell), Tensor(reordered_h0), Tensor(reordered_c0), Tensor(checked_cell)
-  infer_meta :
-    func : FusionLstmInferMeta
-  kernel :
-    func : fusion_lstm
-    data_type : x
-  optional : h0, c0
-  intermediate : xx, batched_input, batched_hidden, batched_cell, reordered_h0, reordered_c0, checked_cell
-
 - op: multi_gru
   args: (Tensor x, Tensor[] weight_x, Tensor[] weight_h, Tensor[] bias, Tensor[] scale_weights, str activation="tanh", str gate_activation="sigmoid", int layers=1, bool origin_mode=false, str mkldnn_data_type="float32", float scale_data=1.0, float shift_data=1.0, bool force_fp32_output=false)
   output: Tensor(hidden)
diff --git a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml
index f408cece8e006..5a9c9a66a2e75 100644
--- a/paddle/phi/ops/yaml/inconsistent/static_backward.yaml
+++ b/paddle/phi/ops/yaml/inconsistent/static_backward.yaml
@@ -485,16 +485,6 @@
   composite : tile_grad(x, out_grad, repeat_times, x_grad)
   backward : tile_double_grad
 
-- backward_op: fused_elemwise_add_activation_grad
-  forward: fused_elemwise_add_activation(Tensor x, Tensor y, str[] functor_list, float scale=0.0, int axis=-1, bool save_intermediate_out=false) -> Tensor(out), Tensor(intermediate_out)
-  args: (Tensor x, Tensor y, Tensor out, Tensor intermediate_out, Tensor out_grad, str[] functor_list, float scale=0.0, int axis=-1, bool save_intermediate_out=false)
-  output: Tensor(x_grad), Tensor(y_grad)
-  infer_meta:
-    func: FusedElemwiseAddActivationGradInferMeta
-  kernel:
-    func: fused_elemwise_add_activation_grad
-  optional : x, intermediate_out
-
 - backward_op: match_matrix_tensor_grad
   forward: match_matrix_tensor (Tensor x, Tensor y, Tensor w, int dim_t=1) ->  Tensor(out), Tensor(tmp)
   args: (Tensor x, Tensor y, Tensor w, Tensor tmp, Tensor out_grad, int dim_t=1)
@@ -503,13 +493,3 @@
     func: MatchMatrixTensorGradInferMeta
   kernel:
     func: match_matrix_tensor_grad
-
-- backward_op: shuffle_batch_grad
-  forward: shuffle_batch (Tensor x, Tensor seed, int startup_seed=0) -> Tensor(out), Tensor(shuffle_idx), Tensor(seed_out)
-  args: (Tensor shuffle_idx, Tensor out_grad,int startup_seed=0)
-  output : Tensor(x_grad)
-  infer_meta:
-    func: ShuffleBatchGradInferMeta
-  kernel:
-    func: shuffle_batch_grad
-    data_type : out_grad
diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml
index ddfe98cefcc80..a5921bb3a039a 100644
--- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml
+++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml
@@ -49,14 +49,6 @@
   inplace : (x -> out)
   interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface
 
-- op : assign_pos
-  args : (Tensor x, Tensor cum_count, Tensor eff_num_len)
-  output : Tensor(out)
-  infer_meta :
-    func : AssignPosInferMeta
-  kernel :
-    func : assign_pos
-
 - op : assign_value
   args : (int[] shape, DataType dtype, Scalar[] values, Place place = {})
   output : Tensor(out)
@@ -196,15 +188,6 @@
     data_type : dtype
   inplace: (input -> output)
 
-- op : decayed_adagrad
-  args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f)
-  output : Tensor(param_out), Tensor(moment_out)
-  infer_meta :
-    func : DecayedAdagradInferMeta
-  kernel :
-    func : decayed_adagrad
-    data_type : param
-
 - op : dequantize_linear
   args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false)
   output : Tensor(y), Tensor(out_state), Tensor(out_accum), Tensor(out_scale)
@@ -608,15 +591,6 @@
   interfaces : paddle::dialect::InferSymbolicShapeInterface
   traits : pir::SideEffectTrait
 
-- op : prune_gate_by_capacity
-  args : (Tensor gate_idx, Tensor expert_count, int64_t n_expert, int64_t n_worker)
-  output : Tensor(new_gate_idx)
-  infer_meta :
-    func : PruneGateByCapacityInferMeta
-  kernel :
-    func : prune_gate_by_capacity
-    data_type : gate_idx
-
 - op : pull_box_sparse
   args : (Tensor w, Tensor[] ids, bool is_sparse = false, bool is_distributed = false, int size = 1)
   output : Tensor[](out){ids.size()}
@@ -812,17 +786,6 @@
     param: [x]
   inplace : (x -> out)
 
-- op : shuffle_batch
-  args : (Tensor x, Tensor seed, int startup_seed=0)
-  output : Tensor(out), Tensor(shuffle_idx), Tensor(seed_out)
-  infer_meta:
-     func: ShuffleBatchInferMeta
-  kernel:
-     func: shuffle_batch
-     data_type: x
-  backward : shuffle_batch_grad
-  traits : pir::SideEffectTrait
-
 - op : soft_relu
   args : (Tensor x, float threshold = 20.0f)
   output : Tensor(out)
@@ -859,16 +822,6 @@
   backward : subtract_grad
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
-- op : tdm_sampler
-  args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2)
-  output: Tensor(out), Tensor(labels), Tensor(mask)
-  infer_meta:
-    func : TdmSamplerInferMeta
-  kernel:
-    func : tdm_sampler
-    data_type : x
-  optional : labels
-
 - op : tile
   args : (Tensor x, IntArray repeat_times = {})
   output : Tensor(out)
@@ -935,16 +888,6 @@
   optional: cache_kv, ln_scale, ln_bias, qkv_bias, src_mask, out_linear_bias, ln_scale_2, ln_bias_2, ln_mean_2, ln_var_2, bias_dropout_residual_out, cache_kv_out
   backward: fused_attention_grad
 
-- op: fused_elemwise_add_activation
-  args: (Tensor x, Tensor y, str[] functor_list, float scale=0.0, int axis=-1, bool save_intermediate_out=false)
-  output: Tensor(out), Tensor(intermediate_out)
-  kernel:
-    func: fused_elemwise_add_activation
-  infer_meta:
-    func : FusedElemwiseAddActivationInferMeta
-  backward: fused_elemwise_add_activation_grad
-  intermediate: intermediate_out
-
 - op: fused_feedforward
   args: (Tensor x, Tensor dropout1_seed, Tensor dropout2_seed, Tensor linear1_weight, Tensor linear1_bias, Tensor linear2_weight, Tensor linear2_bias, Tensor ln1_scale, Tensor ln1_bias, Tensor ln2_scale, Tensor ln2_bias, bool pre_layer_norm, float ln1_epsilon, float ln2_epsilon, str act_method, float dropout1_prob, float dropout2_prob, str dropout1_implementation, str dropout2_implementation, bool is_test, bool dropout1_fix_seed, bool dropout2_fix_seed, int dropout1_seed_val, int dropout2_seed_val, bool add_residual, int ring_id)
   output: Tensor(out), Tensor(dropout1_mask), Tensor(dropout2_mask), Tensor(ln1_mean), Tensor(ln1_variance), Tensor(ln2_mean), Tensor(ln2_variance), Tensor(linear1_out), Tensor(ln1_out), Tensor(dropout1_out), Tensor(dropout2_out)
@@ -1001,15 +944,6 @@
   optional: bias, sample_weight, custom_dist_probs, custom_dist_alias, custom_dist_alias_probs
   backward: nce_grad
 
-- op: number_count
-  args: (Tensor numbers, int upper_range)
-  output: Tensor(out)
-  infer_meta:
-     func: NumberCountInferMeta
-  kernel:
-     func: number_count
-     data_type: numbers
-
 - op: onednn_to_paddle_layout
   args: (Tensor x, int dst_layout)
   output: Tensor(out)
diff --git a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml
index 335952bc3475c..9a327ef5dd4b3 100644
--- a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml
+++ b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml
@@ -5,7 +5,6 @@
 - amax_grad
 - amin_grad
 - cast_grad
-- channel_shuffle_grad
 - conv2d_transpose_double_grad
 - conv2d_transpose_grad
 - deformable_conv_grad
@@ -34,7 +33,6 @@
 - repeat_interleave_grad
 - repeat_interleave_with_tensor_index_grad
 - rnn_grad
-- rrelu_grad
 - set_value_with_tensor_grad
 - slice_double_grad
 - slice_grad
diff --git a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml
index 160e33c5b36c8..703c948240df0 100644
--- a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml
+++ b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml
@@ -22,7 +22,6 @@
 - c_sync_calc_stream
 - c_sync_comm_stream
 - cast
-- channel_shuffle
 - conv2d_transpose
 - conv2d_transpose_bias
 - copy_to
@@ -75,7 +74,6 @@
 - repeat_interleave
 - repeat_interleave_with_tensor_index
 - rnn
-- rrelu
 - sequence_mask
 - set_value_with_tensor
 - slice
diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml
index 1280fd3716f0a..d9d0c222b770f 100755
--- a/paddle/phi/ops/yaml/legacy/static_ops.yaml
+++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml
@@ -699,6 +699,14 @@
     func : swish
   backward : swish_grad
 
+- op : transfer_layout
+  args : (Tensor x, int src_layout = -1, int dst_layout=-1)
+  output : Tensor (out)
+  infer_meta :
+    func : TransferLayoutInferMeta
+  kernel :
+    func : transfer_layout
+
 - op : tril_indices
   args : (int rows = 0, int cols = 0, int offset = 0, DataType dtype = DataType::INT64)
   output : Tensor(out)
diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml
index 2f7af0b64c802..4c6d111f0f4a6 100755
--- a/paddle/phi/ops/yaml/op_compat.yaml
+++ b/paddle/phi/ops/yaml/op_compat.yaml
@@ -4028,6 +4028,13 @@
   outputs:
     {out: Out}
 
+- op: fused_elemwise_activation
+  backward: fused_elemwise_activation_grad
+  inputs:
+    {x : X, y : Y}
+  outputs:
+    {out : Out, intermediate_out : IntermediateOut}
+
 - op: fused_elemwise_add_activation
   backward: fused_elemwise_add_activation_grad
   inputs :
@@ -4062,6 +4069,12 @@
   outputs :
     {slimmed_x : SlimmedX, cls_inds : CLSInds}
 
+- op: fusion_group
+  inputs:
+    inputs : Inputs
+  outputs:
+    outs : Outs
+
 - op: fusion_seqpool_cvm_concat
   inputs:
     {x : X, cvm : CVM}
@@ -4129,6 +4142,15 @@
   outputs:
     {out: Out}
 
+- op: lstm
+  backward: lstm_grad
+  inputs:
+    {input : Input, h0 : H0, c0 : C0, weight : Weight, bias : Bias}
+  outputs:
+    {hidden : Hidden, cell : Cell, batch_gate : BatchGate, batch_cell_pre_act : BatchCellPreAct}
+  extra:
+    outputs: [batch_gate, batch_cell_pre_act]
+
 - op: lu
   backward: lu_grad
   inputs:
@@ -4195,7 +4217,7 @@
   inputs:
     {gate_idx: GateIdx, expert_count: ExpertCount}
   outputs:
-    new_gate_idx: NewGateIdx
+    out_gate_idx: NewGateIdx
 
 - op: pyramid_hash
   backward: pyramid_hash_grad
@@ -4250,6 +4272,8 @@
     {x: X}
   outputs:
     {out: Out, noise: Noise}
+  extra:
+    outputs: [noise]
 
 - op: send_v2
   inputs :
@@ -4355,6 +4379,12 @@
   outputs :
     out : Out
 
+- op: transfer_layout
+  inputs:
+    x : X
+  outputs:
+    out : Out
+
 - op: uniform_random_batch_size_like
   inputs:
      input : Input
diff --git a/paddle/phi/ops/yaml/op_version.yaml b/paddle/phi/ops/yaml/op_version.yaml
index 7ef9a6f83e84d..a41a67e9ded17 100644
--- a/paddle/phi/ops/yaml/op_version.yaml
+++ b/paddle/phi/ops/yaml/op_version.yaml
@@ -486,6 +486,14 @@
           comment : A flag to indicate whether to do softmax
           default : "true"
 
+- op : thresholded_relu
+  version :
+    - checkpoint : Upgrade thresholded_relu, add a new attribute [value]
+      action :
+        - add_attr : value
+          comment : The value assigned to elements of x that do not exceed the threshold.
+          default : 0.0
+
 - op : trace
   version :
     - checkpoint : Upgrade trace add a new attribute [axis2]
diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml
index e758d5e0438f0..3538cbd137762 100755
--- a/paddle/phi/ops/yaml/ops.yaml
+++ b/paddle/phi/ops/yaml/ops.yaml
@@ -321,6 +321,14 @@
   backward : assign_out__grad
   traits : pir::SideEffectTrait
 
+- op : assign_pos
+  args : (Tensor x, Tensor cum_count, Tensor eff_num_len)
+  output : Tensor(out)
+  infer_meta :
+    func : AssignPosInferMeta
+  kernel :
+    func : assign_pos  # no data_type key: the kernel dtype is not pinned to one particular input
+
 - op : assign_value_
   args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {})
   output : Tensor(out)
@@ -760,6 +768,7 @@
   kernel :
     func : class_center_sample
     data_type : label
+  traits : pir::SideEffectTrait
 
 - op : clip
   args : (Tensor x, Scalar(float) min, Scalar(float) max)
@@ -1052,6 +1061,15 @@
     backend : place
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
+- op : decayed_adagrad
+  args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f)
+  output : Tensor(param_out), Tensor(moment_out)
+  infer_meta :
+    func : DecayedAdagradInferMeta
+  kernel :
+    func : decayed_adagrad
+    data_type : param  # kernel dtype follows param, not grad, moment or learning_rate
+
 - op : decode_jpeg
   args : (Tensor x, str mode, Place place)
   output : Tensor(out)
@@ -1262,6 +1280,7 @@
   optional : seed_tensor
   intermediate : mask
   backward : dropout_grad
+  traits : pir::SideEffectTrait
 
 - op : edit_distance
   args : (Tensor hyps, Tensor refs, Tensor hypslength, Tensor refslength, bool normalized = false)
@@ -1672,6 +1691,7 @@
   output : Tensor(out), Tensor(xshape)
   infer_meta :
     func : FlattenWithXShapeInferMeta
+    spmd_rule : FlattenInferSpmd
   kernel :
     func : flatten
     data_type : x
@@ -2658,6 +2678,31 @@
   backward : logsumexp_grad
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
+- op : lp_pool2d
+  args : (Tensor x, IntArray kernel_size, int[] strides = {1,1}, int[] paddings = {0,0}, bool ceil_mode = false, bool exclusive = true, str data_format = "NCHW", str pooling_type = "", bool global_pooling = false, bool adaptive = false, str padding_algorithm = "EXPLICIT", float norm_type = 0.0f)
+  output : Tensor(out)
+  infer_meta :
+    func : Pool2DInferMeta
+    param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm]  # norm_type is not passed to Pool2DInferMeta
+  kernel :
+    func : lp_pool2d
+    param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm, norm_type]  # full argument list, norm_type included
+  backward : lp_pool2d_grad
+
+- op : lstm
+  args : (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias,
+    bool use_peepholes = true, bool is_reverse = false, bool is_test = false,
+    str gate_activation = "sigmoid", str cell_activation = "tanh", str candidate_activation = "tanh")
+  output : Tensor (hidden), Tensor (cell), Tensor (batch_gate), Tensor (batch_cell_pre_act)
+  infer_meta :
+    func : LSTMInferMeta
+  kernel :
+    func : lstm
+    data_type : input
+  optional : h0, c0
+  intermediate : batch_gate, batch_cell_pre_act
+  backward : lstm_grad
+
 - op : lstsq
   args : (Tensor x, Tensor y, Scalar rcond=0.0f, str driver="gels")
   output : Tensor(solution), Tensor(residuals), Tensor(rank), Tensor(singular_values)
@@ -3255,6 +3300,15 @@
   backward : prod_grad
   interfaces : paddle::dialect::InferSymbolicShapeInterface
 
+- op : prune_gate_by_capacity
+  args : (Tensor gate_idx, Tensor expert_count, int64_t n_expert=0, int64_t n_worker=0)
+  output : Tensor(out_gate_idx)  # op_compat.yaml maps out_gate_idx to the legacy output name NewGateIdx
+  infer_meta :
+    func : PruneGateByCapacityInferMeta
+  kernel :
+    func : prune_gate_by_capacity
+    data_type : gate_idx  # kernel dtype follows gate_idx
+
 - op : psroi_pool
   args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height=1, int pooled_width=1, int output_channels=1, float spatial_scale=1.0)
   output : Tensor
@@ -3584,7 +3638,7 @@
   traits : pir::SideEffectTrait
 
 - op : rrelu
-  args : (Tensor x, float lower, float upper, bool is_test)
+  args : (Tensor x, float lower=1.0f/8, float upper=1.0f/3, bool is_test=false)
   output : Tensor(out), Tensor(noise)
   infer_meta :
     func : RReluInferMeta
@@ -3776,6 +3830,19 @@
   kernel :
     func : shard_index
 
+- op : shuffle_batch
+  args : (Tensor x, Tensor seed, int startup_seed=0)
+  output : Tensor(out), Tensor(shuffle_idx), Tensor(seed_out)
+  infer_meta :
+    func : ShuffleBatchInferMeta
+  kernel :
+    func : shuffle_batch
+    data_type : x
+  backward : shuffle_batch_grad
+  traits : pir::SideEffectTrait
+  data_transform :
+    skip_transform : seed
+
 - op : shuffle_channel
   args : (Tensor x, int group = 1)
   output : Tensor(out)
@@ -4148,6 +4215,25 @@
     func : tanh_shrink
   backward : tanh_shrink_grad
 
+- op : tdm_child
+  args : (Tensor x, Tensor tree_info, int child_nums, DataType dtype = DataType::INT32)
+  output : Tensor (child), Tensor (leaf_mask)
+  infer_meta :
+    func : TdmChildInferMeta
+  kernel :
+    func : tdm_child
+    data_type : x
+
+- op : tdm_sampler
+  args : (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2)
+  output : Tensor(out), Tensor(labels), Tensor(mask)
+  infer_meta :
+    func : TdmSamplerInferMeta
+  kernel :
+    func : tdm_sampler
+    data_type : x
+  optional : labels  # labels is one of the three outputs
+
 - op : temporal_shift
   args : (Tensor x, int seg_num, float shift_ratio = 0.25f, str data_format = "NCHW")
   output : Tensor(out)
@@ -4170,7 +4256,7 @@
   no_need_buffer : input
 
 - op : thresholded_relu
-  args : (Tensor x, float threshold = 1.0)
+  args : (Tensor x, float threshold = 1.0, float value = 0.0)
   output : Tensor(out)
   infer_meta :
     func : UnchangedInferMeta
@@ -4374,6 +4460,7 @@
     data_type: x
   inplace: (x -> out)
   backward: uniform_inplace_grad
+  traits : pir::SideEffectTrait
 
 - op : uniform_random_batch_size_like
   args: (Tensor input, int[] shape, int input_dim_idx = 0, int output_dim_idx = 0,
@@ -4386,6 +4473,7 @@
            uniform_random_batch_size_like_sr {selected_rows -> selected_rows}
     data_type: dtype
   no_need_buffer: input
+  traits : pir::SideEffectTrait
 
 - op : unique_consecutive
   args : (Tensor x, bool return_inverse = false, bool return_counts = false, int[] axis = {}, DataType dtype = DataType::FLOAT32)
@@ -4631,3 +4719,12 @@
     func: MoeInferMeta
   kernel:
     func: moe
+
+- op : number_count
+  args : (Tensor numbers, int upper_range)
+  output : Tensor(out)
+  infer_meta :
+    func : NumberCountInferMeta
+  kernel :
+    func : number_count
+    data_type : numbers
diff --git a/paddle/phi/ops/yaml/sparse_backward.yaml b/paddle/phi/ops/yaml/sparse_backward.yaml
index 3e614b942d301..f7734af1bf6ec 100644
--- a/paddle/phi/ops/yaml/sparse_backward.yaml
+++ b/paddle/phi/ops/yaml/sparse_backward.yaml
@@ -184,6 +184,17 @@
     func : log1p_coo_grad {sparse_coo, sparse_coo -> sparse_coo},
            log1p_csr_grad {sparse_csr, sparse_csr -> sparse_csr}
 
+- backward_op : mask_as_grad  # sparse out_grad (COO or CSR) -> dense x_grad
+  forward : mask_as(Tensor x, Tensor mask) -> Tensor(out)
+  args : (Tensor x, Tensor mask, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]  # only x is handed to infer_meta; mask and out_grad are not
+  kernel :
+    func : mask_as_coo_grad {dense, sparse_coo, sparse_coo -> dense},
+           mask_as_csr_grad {dense, sparse_csr, sparse_csr -> dense}
+
 - backward_op : masked_matmul_grad
   forward : masked_matmul(Tensor x, Tensor y, Tensor mask) -> Tensor(out)
   args : (Tensor x, Tensor y, Tensor out_grad)
diff --git a/paddle/phi/ops/yaml/sparse_ops.yaml b/paddle/phi/ops/yaml/sparse_ops.yaml
index ac230be485c09..80cef73a6c1f5 100644
--- a/paddle/phi/ops/yaml/sparse_ops.yaml
+++ b/paddle/phi/ops/yaml/sparse_ops.yaml
@@ -497,6 +497,18 @@
     func : indices_coo{sparse_coo -> dense}
     layout : x
 
+- op: mask_as  # dense x + sparse mask -> sparse out, in the mask's format (COO or CSR)
+  args : (Tensor x, Tensor mask)
+  output : Tensor(out)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]  # only x is handed to infer_meta; mask is not
+  kernel :
+    func : mask_as_coo{dense, sparse_coo -> sparse_coo},
+           mask_as_csr{dense, sparse_csr -> sparse_csr}
+    layout : x
+  backward: mask_as_grad
+
 - op: masked_matmul
   args : (Tensor x, Tensor y, Tensor mask)
   output : Tensor(out)
diff --git a/paddle/pir/include/core/block.h b/paddle/pir/include/core/block.h
index 25b4afe9bfc47..3756e738b22bb 100644
--- a/paddle/pir/include/core/block.h
+++ b/paddle/pir/include/core/block.h
@@ -91,7 +91,7 @@ class IR_API Block {
   bool HasOneUse() const;
   BlockOperand *first_use_addr() { return &first_use_; }
 
-  // This is a unsafe funcion, please use it carefully.
+  // This is an unsafe function; please use it carefully.
   void ResetOpListOrder(const OpListType &new_op_list);
 
   ///
diff --git a/paddle/pir/include/core/block_argument.h b/paddle/pir/include/core/block_argument.h
index b3b8c78660c34..c11fd88c9c11f 100644
--- a/paddle/pir/include/core/block_argument.h
+++ b/paddle/pir/include/core/block_argument.h
@@ -54,10 +54,10 @@ class IR_API BlockArgument : public Value {
   void Destroy();
   /// set the position in the block argument list.
   void set_index(uint32_t index);
-  // Access create annd destroy.
+  // Access create and destroy.
   friend Block;
 
-  // Access classof annd dyn_cast_from.
+  // Access classof and dyn_cast_from.
   friend Value;
   static bool classof(Value value);
   static BlockArgument dyn_cast_from(Value value);
diff --git a/paddle/pir/include/core/builtin_op.h b/paddle/pir/include/core/builtin_op.h
index e12db2e3be124..875f1c73b7565 100644
--- a/paddle/pir/include/core/builtin_op.h
+++ b/paddle/pir/include/core/builtin_op.h
@@ -39,7 +39,7 @@ class IR_API ModuleOp : public pir::Op<ModuleOp> {
   Block &block();
 
   //
-  // As the top operation, ModuleOp only support create&destroye through
+  // As the top operation, ModuleOp only supports create&destroy through
   // below interface: "create"&"destroy".
   static ModuleOp Create(IrContext *context, Program *pointer);
   void Destroy();
@@ -84,7 +84,7 @@ class IR_API SetParameterOp : public pir::Op<SetParameterOp, SideEffectTrait> {
 };
 
 ///
-/// \brief ShdowOutputOp: ShdowOutputOp(OpOperand, {StrAttribute,
+/// \brief ShadowOutputOp: ShadowOutputOp(OpOperand, {StrAttribute,
 /// StrAttribute})
 ///
 class IR_API ShadowOutputOp
diff --git a/paddle/pir/include/core/builtin_type_storage.h b/paddle/pir/include/core/builtin_type_storage.h
index f706e0c66277e..0b74d8e127bf8 100644
--- a/paddle/pir/include/core/builtin_type_storage.h
+++ b/paddle/pir/include/core/builtin_type_storage.h
@@ -127,7 +127,7 @@ struct VectorTypeStorage : public TypeStorage {
   ~VectorTypeStorage() { free(data_); }
 
   ///
-  /// \brief Each derived TypeStorage must define a Construc method, which
+  /// \brief Each derived TypeStorage must define a Construct method, which
   /// StorageManager uses to construct a derived TypeStorage.
   ///
   static VectorTypeStorage* Construct(const ParamKey& key) {
diff --git a/paddle/pir/include/core/program.h b/paddle/pir/include/core/program.h
index d838916eefea5..4d0da62a98c84 100644
--- a/paddle/pir/include/core/program.h
+++ b/paddle/pir/include/core/program.h
@@ -57,6 +57,7 @@ class IR_API Program {
 
   std::shared_ptr<Program> Clone(IrMapping& ir_mapping) const;  // NOLINT
 
+  void CopyToBlock(IrMapping& ir_mapping, Block* insert_block) const;  // NOLINT
   Block* block() { return &module_.block(); }
   const Block* block() const { return &module_op().block(); }
 
@@ -70,9 +71,13 @@ class IR_API Program {
     parameters_ = parameters;
   }
 
+  uint64_t id() const { return id_; }
+
  private:
   // computation graph
   ModuleOp module_;
+  // unique in current process, "almost" unique between processes.
+  uint64_t id_;
   // weight
   ParameterMap parameters_;
 };
diff --git a/paddle/pir/include/dialect/shape/utils/shape_analysis.h b/paddle/pir/include/dialect/shape/utils/shape_analysis.h
index bbdda621511eb..0256d97dbc2b1 100644
--- a/paddle/pir/include/dialect/shape/utils/shape_analysis.h
+++ b/paddle/pir/include/dialect/shape/utils/shape_analysis.h
@@ -42,7 +42,7 @@ class IR_API InferSymbolicShapeContext {
 
   const symbol::ShapeOrDataDimExprs& GetShapeOrDataForValue(Value val) const;
 
-  void SetStaticShapeForValue(Value val);
+  void SetSymbolForValueByStaticShape(Value val);
 
   void SetShapeOrDataForValue(Value val,
                               const symbol::ShapeOrDataDimExprs& shape_or_data);
@@ -150,7 +150,7 @@ class IR_API ShapeConstraintIRAnalysis final
 
   friend void InferSymExprForAllValues(ModuleOp module_op);
 
-  void SetStaticShapeForValue(Value val);
+  void SetSymbolForValueByStaticShape(Value val);
 
   void InferShapeOrDataForValue(Value val);
 
diff --git a/paddle/pir/src/core/op_result_impl.cc b/paddle/pir/src/core/op_result_impl.cc
index 29d411c1a6c88..75261f77cf0e7 100644
--- a/paddle/pir/src/core/op_result_impl.cc
+++ b/paddle/pir/src/core/op_result_impl.cc
@@ -19,8 +19,7 @@
 #include "paddle/pir/include/core/operation.h"
 #include "paddle/pir/src/core/op_result_impl.h"
 
-namespace pir {
-namespace detail {
+namespace pir::detail {
 
 uint32_t OpResultImpl::index() const {
   if (const auto *outline_result = dyn_cast<OpOutlineResultImpl>(this)) {
@@ -111,5 +110,4 @@ OpInlineResultImpl::OpInlineResultImpl(Type type, uint32_t result_index)
           result_index));
 }
 
-}  // namespace detail
-}  // namespace pir
+}  // namespace pir::detail
diff --git a/paddle/pir/src/core/program.cc b/paddle/pir/src/core/program.cc
index 19d08f094fd4c..453cf3eb170df 100644
--- a/paddle/pir/src/core/program.cc
+++ b/paddle/pir/src/core/program.cc
@@ -13,13 +13,48 @@
 // limitations under the License.
 
 #include "paddle/pir/include/core/program.h"
+#include <limits>
+#include <mutex>
+#include <random>
+#include <unordered_set>
 #include "glog/logging.h"
 #include "paddle/pir/include/core/ir_context.h"
 
 namespace pir {
 
+namespace {
+
+// Returns a uniform id in [0, INT64_MAX]; two rd() draws give a 64-bit seed.
+int64_t GetRandomId() {
+  std::random_device rd{};
+  std::mt19937_64 gen((static_cast<uint64_t>(rd()) << 32) | rd());
+  std::uniform_int_distribution<int64_t> dis(0, INT64_MAX);
+  return dis(gen);
+}
+
+// Records random_id in a process-wide set; returns false if already present.
+bool InsertGlobalStorageSuccess(int64_t random_id) {
+  static std::unordered_set<int64_t> storage;
+  static std::mutex mutex;
+  std::lock_guard<std::mutex> lock(mutex);  // guards storage across threads
+  return storage.emplace(random_id).second;
+}
+
+// Draws ids until one is new to this process; logs FATAL after kLimit misses.
+int64_t GetUniqueRandomId() {
+  constexpr int kLimit = 100;
+  for (int i = 0; i < kLimit; ++i) {
+    const int64_t random_id = GetRandomId();
+    if (InsertGlobalStorageSuccess(random_id)) return random_id;
+  }
+  LOG(FATAL) << "Fatal bug occurred in GetUniqueRandomId().";
+}
+
+}  // namespace
+
 Program::Program(IrContext* context) {
   module_ = ModuleOp::Create(context, this);
+  id_ = GetUniqueRandomId();  // distinct per Program within this process
 }
 
 Program::~Program() {
@@ -39,6 +74,26 @@ std::shared_ptr<Program> Program::Clone(IrMapping& ir_mapping) const {
   return new_program;
 }
 
+// Clones each op in this program's block into insert_block, skipping any op
+// that has a result already present in ir_mapping's value map.
+void Program::CopyToBlock(IrMapping& ir_mapping, Block* insert_block) const {
+  const auto clone_options = CloneOptions::All();
+  const auto has_mapped_result = [&ir_mapping](const auto& op) {
+    for (uint32_t idx = 0; idx < op.num_results(); ++idx) {
+      if (ir_mapping.GetMutableMap<pir::Value>().count(op.result(idx)) != 0) {
+        return true;
+      }
+    }
+    return false;
+  };
+  for (const auto& op : *block()) {
+    if (has_mapped_result(op)) {
+      continue;
+    }
+    insert_block->push_back(op.Clone(ir_mapping, clone_options));
+  }
+}
+
 Parameter* Program::GetParameter(const std::string& name) const {
   if (parameters_.count(name) != 0) {
     return parameters_.at(name).get();
diff --git a/paddle/pir/src/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.cc b/paddle/pir/src/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.cc
index f1c44e945f60c..d635a0ac5cc52 100644
--- a/paddle/pir/src/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.cc
+++ b/paddle/pir/src/dialect/shape/interface/infer_symbolic_shape/infer_symbolic_shape.cc
@@ -18,7 +18,7 @@
 // cinn operators.
 
 // Add `interfaces : pir::InferSymbolicShapeInterface` in relative
-// yaml file to conresponding op.
+// yaml file to corresponding op.
 
 // Since necessary checks have been done in the Op's `InferMeta` and `VeriySig`,
 // no more repetitive work here.
diff --git a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc
index 343b1bf329c2c..e51cf34aa4bc9 100644
--- a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc
+++ b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc
@@ -126,6 +126,10 @@ void DebugPrintOpInfo(pir::Operation* op,
   std::ostringstream print_stream;
   for (uint32_t i = 0; i < op->num_results(); ++i) {
     const auto& res = op->result(i);
+    if (!res || !res.type()) {
+      continue;
+    }
+
     print_stream << "\tresult(" << res.dyn_cast<pir::OpResult>().index() << ") "
                  << "ShapeOrData: {";
 
@@ -170,6 +174,10 @@ void CheckInferSymWithInferMeta(
     pir::InferSymbolicShapeContext* infer_context = nullptr) {
   for (uint32_t i = 0; i < op->num_results(); ++i) {
     const auto& res = op->result(i);
+    if (!res || !res.type()) {
+      continue;
+    }
+
     std::ostringstream print_stream;
 
     // InferMeta funcs of some Ops are not corrrect now, we don't check them.
@@ -299,7 +307,7 @@ void InferSymExprForBlock(const Block& block,
                    << " DOES NOT have InferSymbolicShapeInterface!";
       }
       for (uint32_t i = 0; i < op.num_results(); ++i) {
-        infer_context->SetStaticShapeForValue(op.result(i));
+        infer_context->SetSymbolForValueByStaticShape(op.result(i));
       }
     }
     DebugPrintOpInfo(&op, infer_context);
@@ -314,6 +322,9 @@ void InferSymExprForAllValues(ModuleOp module_op) {
   auto infer_context = shape_analysis.MutInferSymbolicShapeContext();
   for (uint32_t i = 0; i < module_op->num_regions(); i++) {
     for (auto& block : module_op->region(i)) {
+      for (auto& [_, value] : block.kwargs()) {
+        infer_context->SetSymbolForValueByStaticShape(value);
+      }
       InferSymExprForBlock(block, infer_context);
     }
   }
diff --git a/paddle/pir/src/dialect/shape/utils/constraints_manager.cc b/paddle/pir/src/dialect/shape/utils/constraints_manager.cc
index bdb9e52a49507..7b2a887cfaa8c 100644
--- a/paddle/pir/src/dialect/shape/utils/constraints_manager.cc
+++ b/paddle/pir/src/dialect/shape/utils/constraints_manager.cc
@@ -100,22 +100,22 @@ void ConstraintsManager::AddEqCstr(const DimExpr& lhs, const DimExpr& rhs) {
     equals_.Union(lhs, rhs);
     VLOG(4) << "add equal constraint: " << lhs << " == " << rhs;
   }
-  DimExpr origin, subsutituted;
+  DimExpr origin, substituted;
   auto comp_result = CompareDimExprPriority(lhs, rhs);
   if (comp_result == PriorityComparisonStatus::LOWER) {
     origin = lhs;
-    subsutituted = rhs;
+    substituted = rhs;
   } else if (comp_result == PriorityComparisonStatus::HIGHER) {
     origin = rhs;
-    subsutituted = lhs;
+    substituted = lhs;
   } else {
     return;
   }
-  if (CanSubstituteInConstraint(origin, subsutituted)) {
-    SubstituteInConstraint(origin, subsutituted);
+  if (CanSubstituteInConstraint(origin, substituted)) {
+    SubstituteInConstraint(origin, substituted);
   }
   if (equal_callback_func_) {
-    equal_callback_func_(origin, subsutituted);
+    equal_callback_func_(origin, substituted);
   }
 }
 
diff --git a/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc b/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc
index c7b5e21a2e01b..c622194e602eb 100644
--- a/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc
+++ b/paddle/pir/src/dialect/shape/utils/dim_expr_util.cc
@@ -751,7 +751,7 @@ struct FoldOperandTrait<Broadcast> {
       PADDLE_ENFORCE_EQ(
           *value,
           expr_value,
-          phi::errors::InvalidArgument("The value (%d) should be equel to expr "
+          phi::errors::InvalidArgument("The value (%d) should be equal to expr "
                                        "(%d) when they are both not 1.",
                                        *value,
                                        expr_value));
@@ -887,7 +887,7 @@ struct FoldRedundantSymbolicBroadcast {
                 ret.value().value,
                 int64_value,
                 phi::errors::InvalidArgument(
-                    "The value of return (%d) should be equel to expr (%d) of "
+                    "The value of return (%d) should be equal to expr (%d) of "
                     "operands at index (%d) when they are both > 1.",
                     ret.value().value,
                     int64_value,
diff --git a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc
index b62ad0f2a3d95..3c51cf57226c4 100644
--- a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc
+++ b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc
@@ -67,14 +67,15 @@ InferSymbolicShapeContext::GetShapeOrDataForValue(Value val) const {
   return value_id_to_shape_or_data_.at(val.impl()->id());
 }
 
-void InferSymbolicShapeContext::SetStaticShapeForValue(Value val) {
+void InferSymbolicShapeContext::SetSymbolForValueByStaticShape(Value val) {
   const auto& value_type = val.type();
   if (!val || !value_type) {
-    PADDLE_THROW(
-        phi::errors::Fatal("Set static shape for null value is FOBBIDEN!"));
+    LOG(WARNING) << "Risk on SetSymbolForValueByStaticShape for null value";
+    return;
   }
   if (!IsStaticShape(val)) {
-    LOG(WARNING) << "Risk on SetStaticShapeForValue for contain_unknown_dim";
+    LOG(WARNING)
+        << "Risk on SetSymbolForValueByStaticShape for contain_unknown_dim";
   }
   const auto& GetStaticShapeForDenseTensorType =
       [&](DenseTensorType type_info) -> symbol::TensorShapeOrDataDimExprs {
@@ -289,8 +290,8 @@ const std::string ShapeConstraintIRAnalysis::GetNextSymName() {
   return context_.GetNextSymName();
 }
 
-void ShapeConstraintIRAnalysis::SetStaticShapeForValue(Value val) {
-  context_.SetStaticShapeForValue(val);
+void ShapeConstraintIRAnalysis::SetSymbolForValueByStaticShape(Value val) {
+  context_.SetSymbolForValueByStaticShape(val);  // forwards to context_
 }
 
 void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) {
@@ -319,7 +320,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) {
         for (auto& operand : GetRealOperandSource(op)) {
           if (operand.impl() && !context_.HasShapeOrDataForValue(operand)) {
             if (!operand.defining_op()) {
-              SetStaticShapeForValue(operand);
+              SetSymbolForValueByStaticShape(operand);
             } else {
               Visit(operand.defining_op());
             }
@@ -334,7 +335,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) {
     for (auto& operand : GetRealOperandSource(op)) {
       if (operand.impl() && !context_.HasShapeOrDataForValue(operand)) {
         if (!operand.defining_op()) {
-          SetStaticShapeForValue(operand);
+          SetSymbolForValueByStaticShape(operand);
         } else {
           has_prev_op = true;
         }
@@ -379,22 +380,23 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) {
     if (infer_symbolic_shape_interface) {
       infer_symbolic_shape_interface.InferSymbolicShape(&context_);
       for (auto& result_value : op->results()) {
-        if (result_value && (!context_.HasShapeOrDataForValue(result_value))) {
+        if (!result_value || !result_value.type()) {
+          continue;
+        }
+        if (!context_.HasShapeOrDataForValue(result_value)) {
           PADDLE_THROW(phi::errors::Fatal(op->name() +
                                           " HAS ERROR on InferSymbolicShape!"));
         }
       }
     } else {
-      // TODO(Hongqing-work): throw it after the shape analysis reconstruct
-      // is done.
-      // PADDLE_THROW(phi::errors::Unimplemented(
-      //     val.defining_op()->name() +
-      //     " DOES NOT have InferSymbolicShapeInterface!"));
       LOG(WARNING) << op->name()
                    << " DOES NOT have InferSymbolicShapeInterface!";
       for (auto& result_value : op->results()) {
-        if (result_value && (!context_.HasShapeOrDataForValue(result_value))) {
-          SetStaticShapeForValue(result_value);
+        if (!result_value || !result_value.type()) {
+          continue;
+        }
+        if (!context_.HasShapeOrDataForValue(result_value)) {
+          SetSymbolForValueByStaticShape(result_value);
         }
       }
     }
@@ -412,7 +414,7 @@ ShapeConstraintIRAnalysis::GetShapeOrDataForValue(Value val) {
   if (!context_.HasShapeOrDataForValue(val)) {
     // backtrack to infer shape from defining op
     if (!val.defining_op()) {
-      SetStaticShapeForValue(val);
+      SetSymbolForValueByStaticShape(val);
     } else {
       VLOG(3) << "InferShapeOrDataForValue,  defining_op: "
               << val.defining_op()->name();
diff --git a/paddle/scripts/build_docker_images.sh b/paddle/scripts/build_docker_images.sh
index 2b584cdca6b4c..e078e473f573f 100644
--- a/paddle/scripts/build_docker_images.sh
+++ b/paddle/scripts/build_docker_images.sh
@@ -1,13 +1,13 @@
 #!/bin/sh
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh
index 5793a38d6ef3a..90d7af6a0c0df 100644
--- a/paddle/scripts/fast_install.sh
+++ b/paddle/scripts/fast_install.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -468,7 +468,7 @@ function PipLinuxInstall(){
             fi
         else
           echo paddlepaddle whl包下载失败
-          echo "wget err: $wheel_gpu_develop" 
+          echo "wget err: $wheel_gpu_develop"
           exit 1
         fi
   else
diff --git a/paddle/scripts/musl_build/Dockerfile b/paddle/scripts/musl_build/Dockerfile
index 1c53284cef6b3..babf3f6050039 100644
--- a/paddle/scripts/musl_build/Dockerfile
+++ b/paddle/scripts/musl_build/Dockerfile
@@ -1,11 +1,11 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/scripts/musl_build/build_docker.sh b/paddle/scripts/musl_build/build_docker.sh
index 0739cbdf731c8..c822a3a225136 100755
--- a/paddle/scripts/musl_build/build_docker.sh
+++ b/paddle/scripts/musl_build/build_docker.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -46,7 +46,7 @@ function build_image(){
     declare -a BUILD_ARGS
     BUILD_ARGS+=("--build-arg" "PYTHON_VERSION=$PYTHON_VERSION")
     echo ">>> python version: $PYTHON_VERSION"
-    
+
     if [ "$HTTP_PROXY" ]; then
         BUILD_ARGS+=("--build-arg" "http_proxy=$HTTP_PROXY")
         echo ">>> using http proxy: $HTTP_PROXY"
@@ -81,7 +81,7 @@ function build_image(){
         echo ">>> with pip index: $WITH_PIP_INDEX"
         BUILD_ARGS+=("--build-arg" pip_index="$WITH_PIP_INDEX")
     fi
-        
+
     echo ">>> build docker image: $BUILD_IMAGE"
     # shellcheck disable=2086
     docker build \
diff --git a/paddle/scripts/musl_build/build_inside.sh b/paddle/scripts/musl_build/build_inside.sh
index 4c7fa804de578..297f1f058e0e4 100755
--- a/paddle/scripts/musl_build/build_inside.sh
+++ b/paddle/scripts/musl_build/build_inside.sh
@@ -1,13 +1,13 @@
 #!/bin/sh
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,13 +33,13 @@ cd "$BUILD_DIR"
 # setup root dir
 chown -R root:root /root
 
-if [ "$HTTP_PROXY" ]; then 
-    echo ">>> http_proxy: $HTTP_PROXY" 
+if [ "$HTTP_PROXY" ]; then
+    echo ">>> http_proxy: $HTTP_PROXY"
     git config --global http.proxy "$HTTP_PROXY"
 fi
 
-if [ "$HTTP_PROXY" ]; then 
-    echo ">>> https_proxy: $HTTPS_PROXY" 
+if [ "$HTTPS_PROXY" ]; then  # test the variable this branch actually uses
+    echo ">>> https_proxy: $HTTPS_PROXY"
     git config --global https.proxy "$HTTPS_PROXY"
 fi
 
diff --git a/paddle/scripts/musl_build/build_paddle.sh b/paddle/scripts/musl_build/build_paddle.sh
index 879bb823c2714..cfeba3cf92632 100755
--- a/paddle/scripts/musl_build/build_paddle.sh
+++ b/paddle/scripts/musl_build/build_paddle.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/scripts/musl_build/config.sh b/paddle/scripts/musl_build/config.sh
index ded239a2a4da7..4972876c3bd03 100755
--- a/paddle/scripts/musl_build/config.sh
+++ b/paddle/scripts/musl_build/config.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 45b796671852e..82f06bc1b4030 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -94,6 +94,7 @@ if not defined retry_times set retry_times=1
 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python38
 if not defined BUILD_DIR set BUILD_DIR=build
 if not defined TEST_INFERENCE set TEST_INFERENCE=ON
+if not defined WITH_PIP_CUDA_LIBRARIES set WITH_PIP_CUDA_LIBRARIES=OFF
 
 set task_name=%1
 set UPLOAD_TP_FILE=OFF
@@ -301,6 +302,7 @@ rem ------Build windows avx whl package------
 :CASE_build_avx_whl
 set WITH_AVX=ON
 set ON_INFER=ON
+set WITH_PIP_CUDA_LIBRARIES=ON
 if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All
 
 call :cmake || goto cmake_error
@@ -501,12 +503,15 @@ echo %task_name%|findstr build >nul && (
 )
 
 :cmake_impl
+if "%WITH_TESTING%"=="ON" (
+    cd /d %work_dir%\%BUILD_DIR%
+    rem Turn cpp tests off when the check below leaves only_change_python_file.txt
+    python -m pip install PyGithub
+    python %work_dir%\tools\check_only_change_python_files.py
+    if exist %work_dir%\%BUILD_DIR%\only_change_python_file.txt set WITH_CPP_TEST=OFF
+)
+if "%WITH_TESTING%"=="ON" echo WITH_CPP_TEST: %WITH_CPP_TEST%
 cd /d %work_dir%\%BUILD_DIR%
-rem whether to run cpp test
-python -m pip install PyGithub
-python %work_dir%\tools\check_only_change_python_files.py
-if exist %work_dir%\%BUILD_DIR%\only_change_python_file.txt set WITH_CPP_TEST=OFF
-echo WITH_CPP_TEST: %WITH_CPP_TEST%
 echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
 -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
@@ -515,7 +520,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D
 -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^
 -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^
 -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^
--DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD%
+-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES%
 
 echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
@@ -525,7 +530,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D
 -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^
 -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^
 -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^
--DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% >> %work_dir%\win_cmake.sh
+-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% >> %work_dir%\win_cmake.sh
 
 cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
@@ -535,7 +540,7 @@ cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_
 -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^
 -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^
 -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^
--DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD%
+-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES%
 goto:eof
 
 :cmake_error
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8c0266c36e8c1..e793c210628be 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1130,7 +1130,10 @@ function check_whl_size() {
 
 function generate_upstream_develop_api_spec() {
     set -x
+    # Temporarily save some scripts from PR branch
     cp ${PADDLE_ROOT}/python/requirements.txt /tmp
+    cp ${PADDLE_ROOT}/tools/print_signatures.py /tmp
+
     mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/
     pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'`
     echo "pr_whl_size: ${pr_whl_size}"
@@ -1178,17 +1181,20 @@ function generate_api_spec() {
         echo "Not supported $2"
         exit 1
     fi
+    if [ "$spec_kind" == "DEV" ]; then
+        REQUIREMENTS_PATH=/tmp/requirements.txt
+        PRINT_SIGNATURES_SCRIPT_PATH=/tmp/print_signatures.py
+    else
+        REQUIREMENTS_PATH=${PADDLE_ROOT}/python/requirements.txt
+        PRINT_SIGNATURES_SCRIPT_PATH=${PADDLE_ROOT}/tools/print_signatures.py
+    fi
 
     mkdir -p ${PADDLE_ROOT}/build/.check_api_workspace
     cd ${PADDLE_ROOT}/build/.check_api_workspace
     virtualenv -p `which python` .${spec_kind}_env
     source .${spec_kind}_env/bin/activate
+    pip install -r $REQUIREMENTS_PATH
 
-    if [ "$spec_kind" == "DEV" ]; then
-        pip install -r /tmp/requirements.txt
-    else
-        pip install -r ${PADDLE_ROOT}/python/requirements.txt
-    fi
     if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then
         pip install ${PADDLE_ROOT}/build/python/dist/*whl
     elif [ -d "${PADDLE_ROOT}/dist/" ];then
@@ -1196,7 +1202,10 @@ function generate_api_spec() {
         mkdir ${PADDLE_ROOT}/build/python/dist/ && mv  ${PADDLE_ROOT}/dist/*whl  ${PADDLE_ROOT}/build/python/dist/
     fi
     spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
-    python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
+    python ${PRINT_SIGNATURES_SCRIPT_PATH} paddle > $spec_path
+    python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="args,varargs,varkw,defaults,kwonlyargs,kwonlydefaults" paddle > ${spec_path}.api
+    python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="annotations" paddle > ${spec_path}.annotations
+    python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="document" paddle > ${spec_path}.doc
 
     # used to log op_register data_type
     op_type_path=${PADDLE_ROOT}/paddle/fluid/OP_TYPE_${spec_kind}.spec
@@ -1214,9 +1223,6 @@ function generate_api_spec() {
     api_source_md5_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.source.md5
     python ${PADDLE_ROOT}/tools/count_api_without_core_ops.py -p paddle > $api_source_md5_path
 
-    awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
-    awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
-
     python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py \
         ${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_${spec_kind}.spec
 
@@ -1474,7 +1480,7 @@ function card_test() {
     if [ "${WITH_XPU}" == "ON" ];then
         CUDA_DEVICE_COUNT=1
     elif [ "${WITH_ROCM}" == "ON" ];then
-        CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l)
+        CUDA_DEVICE_COUNT=$(rocm-smi -i | grep DCU | wc -l)
     elif [ "${WITH_IPU}" == "ON" ];then
         CUDA_DEVICE_COUNT=1
     else
@@ -1517,13 +1523,22 @@ function card_test() {
             if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then
                 (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
             else
-                (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+                if [ "$WITH_ROCM" == "ON" ];then
+                    (env HIP_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+                else
+                    (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+                fi
             fi
         else
             if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then
                 (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure  -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
             else
-                (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure  -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+                # ROCm selects cards with HIP_VISIBLE_DEVICES; other builds use CUDA_VISIBLE_DEVICES.
+                if [ "$WITH_ROCM" == "ON" ];then
+                    (env HIP_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure  -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+                else
+                    (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure  -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+                fi
             fi
         fi
     done
@@ -2652,7 +2667,11 @@ set -x
         fi
         if [ -a "$PADDLE_ROOT/added_ut" ];then
             added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$
-            env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$?
+            if [ "$WITH_ROCM" == "ON" ];then
+                env HIP_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$?
+            else
+                env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$?
+            fi
             ctest -R "(${added_uts})" -L "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error_1=$?
             if [ "$added_ut_error" != 0 ] && [ "$added_ut_error_1" != 0 ];then
                 echo "========================================"
@@ -2826,7 +2845,9 @@ set +x
         rerun_ut_endTime_s=`date +%s`
         echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s"
         echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
-        cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/
+        if [ "$WITH_ROCM" != "ON" ];then
+            cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/
+        fi
         if [[ "$EXIT_CODE" != "0" ]]; then
             show_ut_retry_result
         fi
@@ -3488,7 +3509,6 @@ function build_document_preview() {
     sh /paddle/tools/document_preview.sh ${PORT}
 }
 
-
 # origin name: example
 function exec_samplecode_test() {
     if [ -d "${PADDLE_ROOT}/build/pr_whl" ];then
@@ -3502,10 +3522,10 @@ function exec_samplecode_test() {
 
     cd ${PADDLE_ROOT}/tools
     if [ "$1" = "cpu" ] ; then
-        python sampcd_processor.py --debug --mode cpu; example_error=$?
+        python sampcd_processor.py --mode cpu; example_error=$?
     elif [ "$1" = "gpu" ] ; then
         SAMPLE_CODE_EXEC_THREADS=${SAMPLE_CODE_EXEC_THREADS:-2}
-        python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} --debug --mode gpu; example_error=$?
+        python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} --mode gpu; example_error=$?
     fi
     if [ "$example_error" != "0" ];then
       echo "Code instance execution failed" >&2
@@ -3513,6 +3533,75 @@ function exec_samplecode_test() {
     fi
 }
 
+# Return 0 when the PR page title fetched from GitHub contains "typing"
+# (case-insensitive) and 1 otherwise. Command tracing is switched off while
+# the page is fetched and switched back on before returning.
+function need_type_checking() {
+    set +x
+    local pr_url="https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID}"
+    TITLE_CHECK=$(curl -s ${pr_url} | grep "<title>" | grep -i "typing" || true)
+    local rc=1
+    if [ -n "${TITLE_CHECK}" ]; then
+        rc=0
+    fi
+    set -x
+    return ${rc}
+}
+
+# Install the PR wheel (or the develop wheel), print the Paddle version and run
+# tools/type_checking.py (--full-test when the PR title contains "typing all").
+function exec_type_checking() {
+    local wheel_dir="${PADDLE_ROOT}/build/pr_whl"
+    if [ ! -d "${wheel_dir}" ];then
+        echo "WARNING: PR wheel is not found. Use develop wheel !!!"
+        wheel_dir="${PADDLE_ROOT}/build/python/dist"
+    fi
+    pip install ${wheel_dir}/*.whl
+
+    python -c "import paddle;print(paddle.__version__);paddle.version.show()"
+    cd ${PADDLE_ROOT}/tools
+
+    local pr_url="https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID}"
+    TITLE_CHECK_ALL=$(curl -s ${pr_url} | grep "<title>" | grep -i "typing all" || true)
+    if [ -n "${TITLE_CHECK_ALL}" ]; then
+        python type_checking.py --full-test; type_checking_error=$?
+    else
+        python type_checking.py; type_checking_error=$?
+    fi
+    # Any non-zero checker status is reported on stderr and turned into exit 5.
+    if [ "$type_checking_error" != "0" ];then
+      echo "Example code type checking failed" >&2
+      exit 5
+    fi
+}
+
+
+# Run the sample-code checks (GPU first when WITH_GPU=ON, then CPU) and, only
+# when need_type_checking succeeds, the type checker; then print the summaries.
+function exec_samplecode_checking() {
+    example_info_gpu=""
+    example_code_gpu=0
+    if [ "${WITH_GPU}" == "ON" ] ; then
+        # Only stderr is captured; stdout is passed through to this function's stdout via fd 3.
+        { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1
+        example_code_gpu=$?
+    fi
+    { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1
+    example_code=$?
+
+    # TODO(megemini): type checking should become the default once type annotation is done.
+    need_type_checking; type_checking_status=$?
+    if [ "${type_checking_status}" -eq 0 ]; then
+        { type_checking_info=$(exec_type_checking 2>&1 1>&3 3>/dev/null); } 3>&1
+        type_checking_code=$?
+    fi
+    local example_total=$((example_code_gpu + example_code))
+    summary_check_example_code_problems ${example_total} "${example_info_gpu}\n${example_info}"
+    if [ "${type_checking_status}" -eq 0 ]; then
+        summary_type_checking_problems $type_checking_code "$type_checking_info"
+    fi
+}
+
 
 function collect_ccache_hits() {
     ccache -s
@@ -3553,10 +3642,11 @@ function test_model_benchmark() {
     bash ${PADDLE_ROOT}/tools/test_model_benchmark.sh
 }
 
-function summary_check_problems() {
+function summary_check_example_code_problems() {
     set +x
     local example_code=$1
     local example_info=$2
+
     if [ $example_code -ne 0 ];then
         echo "==============================================================================="
         echo "*****Example code error***** Please fix the error listed in the information:"
@@ -3579,6 +3669,33 @@ function summary_check_problems() {
 }
 
 
+# Print the type checking report framed by separator lines. A non-zero status
+# (first argument) is reported as FAIL and ends the shell with that status;
+# otherwise the report text (second argument) is shown as PASS.
+function summary_type_checking_problems() {
+    set +x
+    local type_checking_code=$1
+    local type_checking_info=$2
+    local separator="==============================================================================="
+    local headline="*****Example code type checking info*****"
+    local verdict="PASS"
+
+    if [ $type_checking_code -ne 0 ];then
+        headline="*****Example code type checking error***** Please fix the error listed in the information:"
+        verdict="FAIL"
+    fi
+
+    printf '%s\n' "$separator" "$headline" "$separator"
+    echo "$type_checking_info"
+    printf '%s\n' "$separator" "*****Example code type checking ${verdict}*****" "$separator"
+    # On failure the status is propagated; tracing is re-enabled only on the PASS path.
+    if [ "$verdict" == "FAIL" ];then
+        exit $type_checking_code
+    fi
+    set -x
+}
+
+
 function reuse_so_cache() {
     get_html="https://api.github.com/repos/PaddlePaddle/Paddle"
     curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt
@@ -3631,7 +3748,10 @@ function build_pr_and_develop() {
     fi
     mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/
     cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true`
+    # Temporarily save some scripts from PR branch
     cp ${PADDLE_ROOT}/python/requirements.txt /tmp
+    cp ${PADDLE_ROOT}/tools/print_signatures.py /tmp
+
     generate_api_spec "$1" "PR"
     mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl
     rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp
@@ -4262,15 +4382,7 @@ function main() {
         check_sequence_op_unittest
         generate_api_spec ${PYTHON_ABI:-""} "PR"
         set +e
-        example_info_gpu=""
-        example_code_gpu=0
-        if [ "${WITH_GPU}" == "ON" ] ; then
-            { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1
-            example_code_gpu=$?
-        fi
-        { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1
-        example_code=$?
-        summary_check_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}"
+        exec_samplecode_checking
         assert_api_spec_approvals
         ;;
       build_and_check_cpu)
@@ -4282,15 +4394,7 @@ function main() {
         ;;
       build_and_check_gpu)
         set +e
-        example_info_gpu=""
-        example_code_gpu=0
-        if [ "${WITH_GPU}" == "ON" ] ; then
-            { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1
-            example_code_gpu=$?
-        fi
-        { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1
-        example_code=$?
-        summary_check_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}"
+        exec_samplecode_checking
         assert_api_spec_approvals
         ;;
       check_whl_size)
@@ -4395,6 +4499,9 @@ function main() {
         export FLAGS_PIR_OPTEST=True
         parallel_test true
         ;;
+      hyg_dcu_test)
+        parallel_test
+        ;;
       nv_cicheck_coverage)
         parallel_test
         nv_test
@@ -4416,10 +4523,6 @@ function main() {
         build ${parallel_number}
         run_brpc_test
         ;;
-      assert_api)
-        generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number}
-        assert_api_spec_approvals
-        ;;
       test_inference)
         PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
         if [ "${WITH_PYTHON}" == "OFF" ] ; then
@@ -4449,9 +4552,6 @@ function main() {
         gen_fluid_lib ${parallel_number}
         test_fluid_lib_train
         ;;
-      assert_api_approvals)
-        assert_api_spec_approvals
-        ;;
       assert_file_approvals)
         assert_file_diff_approvals
         ;;
@@ -4533,11 +4633,6 @@ function main() {
         build ${parallel_number}
         build_document_preview
         ;;
-      api_example)
-        { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1
-        example_code=$?
-        summary_check_problems $example_code "$example_info"
-        ;;
       test_op_benchmark)
         test_op_benchmark
         ;;
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index f016890ca3269..5dc084deeae6c 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -98,7 +98,7 @@ function threads_config() {
       export OPENBLAS_MAIN_FREE=1
     fi
   fi
-  
+
 }
 
 PADDLE_CONF_HOME="$HOME/.config/paddle"
@@ -138,7 +138,7 @@ fi
 
 if [ "@WITH_GPU@" == "ON" ]; then
     PADDLE_NAME="paddlepaddle-gpu"
-else 
+else
     PADDLE_NAME="paddlepaddle"
 fi
 
diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat
index 0aeacfef7f9bd..4ffec08e666e2 100644
--- a/paddle/scripts/windows_build/build.bat
+++ b/paddle/scripts/windows_build/build.bat
@@ -1,5 +1,5 @@
 @ECHO OFF
-SETLOCAL 
+SETLOCAL
 set source_path=%1
 set PYTHON_DIR=%2
 set WITH_GPU=%3
diff --git a/pyproject.toml b/pyproject.toml
index 4a4a5a73c5fda..dc9455167005e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,9 @@ select = [
     # Pyupgrade
     "UP",
 
+    # Flake8-pyi
+    "PYI",
+
     # NumPy-specific rules
     "NPY001",
     "NPY003",
@@ -111,6 +114,10 @@ ignore = [
     "UP015",
     # It will cause the performance regression on python3.10
     "UP038",
+    # collections.namedtuple is a quick way to create an inline class
+    "PYI024",
+    # `__all__.append` is a common pattern in Paddle
+    "PYI056",
 ]
 
 [tool.ruff.lint.isort]
@@ -131,3 +138,34 @@ known-first-party = ["paddle"]
 "test/dygraph_to_static/test_loop.py" = ["C416", "F821"]
 # Ignore unnecessary lambda in dy2st unittest test_lambda
 "test/dygraph_to_static/test_lambda.py" = ["PLC3002"]
+# Ignore docstring in tensor.pyi
+"python/paddle/tensor/tensor.prototype.pyi" = ["PYI021", "PYI048"]
+
+[tool.mypy]
+python_version = "3.8"
+cache_dir = ".mypy_cache"
+# Miscellaneous strictness flags
+allow_redefinition = true
+local_partial_types = true
+strict = false
+# Untyped definitions and calls
+check_untyped_defs = true
+# Import discovery
+follow_imports = "normal"
+# Miscellaneous
+warn_unused_configs = true
+# Configuring warnings
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_no_return = true
+# Configuring error messages
+show_column_numbers = true
+
+[[tool.mypy.overrides]]
+module = [
+    "astor",
+    "cv2",
+    "scipy",
+    "xlsxwriter"
+]
+ignore_missing_imports = true
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b3029a24309cf..16501a254f280 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -25,7 +25,7 @@ set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES})
 if(WITH_GPU)
   set(PACKAGE_NAME "paddlepaddle-gpu")
 elseif(WITH_ROCM)
-  set(PACKAGE_NAME "paddlepaddle-rocm")
+  set(PACKAGE_NAME "paddlepaddle-dcu")
 elseif(WITH_XPU)
   set(PACKAGE_NAME "paddlepaddle-xpu")
 elseif(WITH_IPU)
@@ -173,17 +173,10 @@ endif()
 
 add_custom_target(paddle_python ALL
                   DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
+
 if(BUILD_WHL_PACKAGE AND NOT WITH_SETUP_INSTALL)
-  add_custom_target(
-    paddle_copy ALL
-    # generate tensor.pyi for type hints
-    COMMAND
-      ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-      ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/gen_tensor_stub.py
-      --input-file
-      ${PADDLE_SOURCE_DIR}/python/paddle/tensor/tensor.prototype.pyi
-      --output-file ${PADDLE_BINARY_DIR}/python/paddle/tensor/tensor.pyi
-    DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel)
+  add_custom_target(paddle_copy ALL
+                    DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel)
   add_dependencies(paddle_copy paddle_python)
 endif()
 
diff --git a/python/env_dict.py.in b/python/env_dict.py.in
index 46c280e823df3..62822d4e16cf2 100644
--- a/python/env_dict.py.in
+++ b/python/env_dict.py.in
@@ -14,6 +14,8 @@ env_dict={
     'FLUID_CORE_NAME':'@FLUID_CORE_NAME@',
     'PHI_LIB':'@PHI_LIB@',
     'PHI_NAME':'@PHI_NAME@',
+    'PHI_KERNEL_GPU_LIB':'@PHI_KERNEL_GPU_LIB@',
+    'PHI_KERNEL_GPU_NAME':'@PHI_KERNEL_GPU_NAME@',
     'WITH_SHARED_PHI':'@WITH_SHARED_PHI@',
     'IR_LIB':'@IR_LIB@',
     'IR_NAME':'@IR_NAME@',
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 0cd36f299ecd6..37409b626009b 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -14,12 +14,16 @@
 
 import typing
 
+__is_metainfo_generated = False
 try:
     from paddle.cuda_env import *  # noqa: F403
     from paddle.version import (  # noqa: F401
         commit as __git_commit__,
         full_version as __version__,
     )
+
+    __is_metainfo_generated = True
+
 except ImportError:
     import sys
 
@@ -272,6 +276,7 @@
     atleast_1d,
     atleast_2d,
     atleast_3d,
+    block_diag,
     broadcast_tensors,
     broadcast_to,
     cast,
@@ -433,6 +438,7 @@
     inner,
     inverse,
     isfinite,
+    isin,
     isinf,
     isnan,
     isneginf,
@@ -577,8 +583,7 @@
     if os.path.exists(cuh_file):
         os.environ.setdefault('runtime_include_dir', runtime_include_dir)
 
-
-if is_compiled_with_cuda():
+if __is_metainfo_generated and is_compiled_with_cuda():
     import os
     import platform
 
@@ -679,7 +684,9 @@
                 ctypes.CDLL('msvcp140.dll')
                 ctypes.CDLL('vcruntime140_1.dll')
             except OSError:
-                print(
+                import logging
+
+                logging.error(
                     '''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
                         It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe'''
                 )
@@ -699,7 +706,6 @@
             path_patched = False
             for dll in dlls:
                 is_loaded = False
-                print("dll:", dll)
                 if with_load_library_flags:
                     res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
                     last_error = ctypes.get_last_error()
@@ -733,6 +739,7 @@
 ir_guard._switch_to_pir()
 
 __all__ = [
+    'block_diag',
     'iinfo',
     'finfo',
     'dtype',
@@ -846,6 +853,7 @@
     'squeeze_',
     'to_tensor',
     'gather_nd',
+    'isin',
     'isinf',
     'isneginf',
     'isposinf',
diff --git a/python/paddle/_typing/basic.py b/python/paddle/_typing/basic.py
index 4ecd20b74ebce..f6c7d92ac15ed 100644
--- a/python/paddle/_typing/basic.py
+++ b/python/paddle/_typing/basic.py
@@ -25,30 +25,10 @@
 Numberic: TypeAlias = Union[int, float, complex, np.number, "Tensor"]
 TensorLike: TypeAlias = Union[npt.NDArray[Any], "Tensor", Numberic]
 
-_T = TypeVar("_T", bound=Numberic)
-_SeqLevel1: TypeAlias = Sequence[_T]
-_SeqLevel2: TypeAlias = Sequence[Sequence[_T]]
-_SeqLevel3: TypeAlias = Sequence[Sequence[Sequence[_T]]]
-_SeqLevel4: TypeAlias = Sequence[Sequence[Sequence[Sequence[_T]]]]
-_SeqLevel5: TypeAlias = Sequence[Sequence[Sequence[Sequence[Sequence[_T]]]]]
-_SeqLevel6: TypeAlias = Sequence[
-    Sequence[Sequence[Sequence[Sequence[Sequence[_T]]]]]
-]
-
-IntSequence: TypeAlias = _SeqLevel1[int]
-
-NumbericSequence: TypeAlias = _SeqLevel1[Numberic]
-
-NestedSequence: TypeAlias = Union[
-    _T,
-    _SeqLevel1[_T],
-    _SeqLevel2[_T],
-    _SeqLevel3[_T],
-    _SeqLevel4[_T],
-    _SeqLevel5[_T],
-    _SeqLevel6[_T],
-]
+_T = TypeVar("_T")
 
+NestedSequence = Union[_T, Sequence["NestedSequence[_T]"]]
+IntSequence = Sequence[int]
+NumbericSequence = Sequence[Numberic]
 NestedNumbericSequence: TypeAlias = NestedSequence[Numberic]
-
 TensorOrTensors: TypeAlias = Union["Tensor", Sequence["Tensor"]]
diff --git a/python/paddle/_typing/shape.py b/python/paddle/_typing/shape.py
index 0193840119a66..235bfd6157c9b 100644
--- a/python/paddle/_typing/shape.py
+++ b/python/paddle/_typing/shape.py
@@ -13,23 +13,24 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import List, Tuple, Union
+from typing import TYPE_CHECKING, List, Tuple, Union
 
 from typing_extensions import TypeAlias
 
-from .. import Tensor
+if TYPE_CHECKING:
+    from .. import Tensor
 
 DynamicShapeLike: TypeAlias = Union[
-    Tuple[Union[int, Tensor, None], ...],
-    List[Union[int, Tensor, None]],
-    Tensor,
+    Tuple[Union[int, "Tensor", None], ...],
+    List[Union[int, "Tensor", None]],
+    "Tensor",
 ]
 
 
 ShapeLike: TypeAlias = Union[
     Tuple[int, ...],
     List[int],
-    Tensor,
+    "Tensor",
 ]
 
 
diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py
index 9ae60e5185ee0..34318f3cc9183 100644
--- a/python/paddle/amp/auto_cast.py
+++ b/python/paddle/amp/auto_cast.py
@@ -251,7 +251,7 @@ def _pir_transform(t, dtype):
                 param = op.operand(0).source()
                 cast_param = paddle.cast(param, dtype)
                 cast_param.persistable = True
-                paddle._pir_ops.updata_parameter(cast_param, t.name)
+                paddle._pir_ops.update_parameter(cast_param, t.name)
                 block.remove_op(op)
                 break
     main.set_parameters_from(startup)
diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py
index 0649c3e19bf05..8266b9edc2009 100644
--- a/python/paddle/autograd/backward_utils.py
+++ b/python/paddle/autograd/backward_utils.py
@@ -29,20 +29,21 @@
 
 # TODO: Consider a better way to mark these ops has no grad op.
 # Such as use a new trait to mark these ops.
+# Please keep them in alphabetical order.
 ALLOW_NO_GRAD_OPS = [
     # Compare ops
     "pd_op.equal",
     "pd_op.equal_",
-    "pd_op.not_equal",
-    "pd_op.not_equal_",
-    "pd_op.less_than",
-    "pd_op.less_than_",
-    "pd_op.less_equal",
-    "pd_op.less_equal_",
     "pd_op.greater_than",
     "pd_op.greater_than_",
     "pd_op.greater_equal",
     "pd_op.greater_equal_",
+    "pd_op.less_than",
+    "pd_op.less_than_",
+    "pd_op.less_equal",
+    "pd_op.less_equal_",
+    "pd_op.not_equal",
+    "pd_op.not_equal_",
     # Logical ops
     "pd_op.logical_and",
     "pd_op.logical_and_",
@@ -67,35 +68,39 @@
     "pd_op.bitwise_xor_",
     # Array ops
     "pd_op.assign_array",
-    "pd_op.array_length",
-    "pd_op.slice_array",
-    "pd_op.slice_array_dense",
-    "pd_op.assign_array",
     "pd_op.assign_array_",
-    "pd_op.create_array",
-    "pd_op.create_array_like",
+    "pd_op.array_length",
+    "pd_op.array_pop",
     "pd_op.array_read",
     "pd_op.array_write_",
-    "pd_op.array_pop",
+    "pd_op.create_array",
+    "pd_op.create_array_like",
+    "pd_op.slice_array",
+    "pd_op.slice_array_dense",
     # Others
-    "pd_op.remainder",
-    "pd_op.argmax",
-    "pd_op.print",
     "pd_op.accuracy",
-    "pd_op.randint",
-    "pd_op.uniform",
-    "pd_op.gaussian",
+    "pd_op.all",
+    "pd_op.any",
+    "pd_op.argmax",
+    "pd_op.assign_value_",
     "pd_op.bernoulli",
+    "pd_op.distribute_fpn_proposals",
+    "pd_op.floor_divide",
     "pd_op.full_like",
-    "pd_op.assign_value_",
-    "pd_op.nextafter",
+    "pd_op.full_with_tensor",
+    "pd_op.gaussian",
     "pd_op.isnan",
     "pd_op.isinf",
-    "pd_op.all",
-    "pd_op.any",
+    "pd_op.nextafter",
+    "pd_op.nonzero",
+    "pd_op.one_hot",
+    "pd_op.print",
     "pd_op.prior_box",
+    "pd_op.randint",
+    "pd_op.remainder",
+    "pd_op.shape",
     "pd_op.share_data_",
-    "pd_op.floor_divide",
+    "pd_op.uniform",
 ]
 
 
@@ -113,6 +118,8 @@
     "pd_op.rsqrt",
     "pd_op.sigmoid",
     "pd_op.silu",
+    "pd_op.sum",
+    "pd_op.mean",
 ]
 
 
diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py
index 4b00161bc3c82..f412a954c0bb0 100644
--- a/python/paddle/base/core.py
+++ b/python/paddle/base/core.py
@@ -506,7 +506,11 @@ def _test_use_sync(value):
 
 
 # ops in forward_blacklist will not be replaced by composite ops.
-prim_config = {"forward_blacklist": set(), "composite_ops_record": set()}
+prim_config = {
+    "forward_blacklist": set(),
+    "composite_ops_record": set(),
+    "backward_blacklist": set(),
+}
 
 
 def _get_batch_norm_none_var(op):
@@ -588,6 +592,7 @@ def _reset_prim_forward_blacklist():
 def _set_prim_backward_blacklist(*args):
     ops = set(args)
     for item in ops:
         if not isinstance(item, str):
             raise TypeError("all items in set must belong to string")
+    prim_config["backward_blacklist"].update(ops)  # only after validation
     _set_bwd_prim_blacklist(ops)
@@ -671,3 +676,15 @@ def _check_and_set_prim_vjp_skip_default_ops():
 
 
 _check_and_set_prim_vjp_skip_default_ops()
+
+
+def _check_prim_vjp_ops():
+    """Load FLAGS_prim_backward_blacklist (';'-separated) into the blacklist."""
+    ops_org = os.getenv("FLAGS_prim_backward_blacklist", "")
+    # Skip blank entries so a trailing or doubled ';' never registers "".
+    ops = [item.strip() for item in ops_org.split(";") if item.strip()]
+    if ops:
+        _set_prim_backward_blacklist(*ops)
+
+
+_check_prim_vjp_ops()
diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py
index fef11f5985ef1..74afe0d32ed85 100755
--- a/python/paddle/base/executor.py
+++ b/python/paddle/base/executor.py
@@ -1173,6 +1173,9 @@ def _get_pir_program_and_executor(self, cached_data):
         if core._enable_dist_prim_all():
             with decomp.prim_guard():
                 decomp.decompose_dist_program(program)
+        from paddle.base.libpaddle.pir import dump_pir_py_code_if_need
+
+        dump_pir_py_code_if_need(program, "pir_original_programs.py")
         return program, new_exe, data_op_infos
 
 
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index fcd69d0fd65d1..4b62b57f4e806 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -33,7 +33,6 @@
 import numpy as np
 
 import paddle
-import paddle.version as paddle_version
 
 from .. import pir
 from . import core, unique_name
@@ -573,10 +572,10 @@ def require_version(min_version, max_version=None):
             )
 
     version_installed = [
-        paddle_version.major,
-        paddle_version.minor,
-        paddle_version.patch,
-        paddle_version.rc,
+        paddle.version.major,
+        paddle.version.minor,
+        paddle.version.patch,
+        paddle.version.rc,
     ]
     zero_version = ["0", "0", "0", "0"]
 
@@ -591,13 +590,13 @@ def version_cmp(ver_a, ver_b):
     if version_cmp(version_installed, zero_version) == 0:
         if max_version is not None:
             warnings.warn(
-                f"PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle_version.full_version} installed. "
+                f"PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle.version.full_version} installed. "
                 "Maybe you are using a develop version, "
                 "please make sure the version is good with your code."
             )
         else:
             warnings.warn(
-                f"PaddlePaddle version {min_version} or higher is required, but {paddle_version.full_version} installed, "
+                f"PaddlePaddle version {min_version} or higher is required, but {paddle.version.full_version} installed, "
                 "Maybe you are using a develop version, "
                 "please make sure the version is good with your code."
             )
@@ -619,12 +618,12 @@ def version_cmp(ver_a, ver_b):
             or version_cmp(version_installed, min_version_to_check) < 0
         ):
             raise Exception(
-                f"VersionError: PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle_version.full_version} installed."
+                f"VersionError: PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle.version.full_version} installed."
             )
     else:
         if version_cmp(version_installed, min_version_to_check) < 0:
             raise Exception(
-                f"VersionError: PaddlePaddle version {min_version} or higher is required, but {paddle_version.full_version} installed, "
+                f"VersionError: PaddlePaddle version {min_version} or higher is required, but {paddle.version.full_version} installed, "
                 f"please upgrade your PaddlePaddle to {min_version} or other higher version."
             )
 
@@ -1617,6 +1616,9 @@ def __init__(
         if name is None:
             name = self.block.program._name_generator("_generated_var")
 
+            while self.block._find_var_recursive(name) is not None:
+                name = self.block.program._name_generator("_generated_var")
+
         if dtype is not None:
             dtype = convert_to_proto_type(dtype)
 
diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py
index 6ffaebe444c9d..ab06767768271 100644
--- a/python/paddle/decomposition/decomp.py
+++ b/python/paddle/decomposition/decomp.py
@@ -850,13 +850,15 @@ def decompose_dist_program(pir_program):
     decompose(pir_program, [])
 
     # decomp backward ops
+    blacklist = core.prim_config["backward_blacklist"]
+
     block = pir_program.global_block()
+    pre_combine_op = None
     with paddle.pir.core.program_guard(pir_program):
         ops = pir_program.global_block().ops
         for op in ops:
             bwd_op_name = op.name()
-            # todo(CZ): to be removed
-            if bwd_op_name in ["pd_op.mean_grad", "pd_op.concat_grad"]:
+            if bwd_op_name.split(".")[-1] in blacklist:
                 continue
             skip_decomp = False
             if has_decomp_vjp(op):
@@ -867,13 +869,45 @@ def decompose_dist_program(pir_program):
                 if not skip_decomp:
                     pir.set_insertion_point(op)
                     orig_outs = op.results()
+
+                    is_next_split = False
                     decomp_outs = call_decomp_vjp(op)
-                    new_outs = _analyse_decomp_results(
-                        orig_outs, decomp_outs, op
-                    )
-                    op.replace_all_uses_with(new_outs)
+                    for i in range(len(orig_outs)):
+                        if orig_outs[i].has_one_use():
+                            next_op = orig_outs[i].first_use().owner()
+                            if next_op.name() == "builtin.split":
+                                is_next_split = True
+                                _check_op_results(
+                                    next_op.name(),
+                                    next_op.results(),
+                                    decomp_outs[i],
+                                )
+                                next_op.replace_all_uses_with(decomp_outs[i])
+                                block.remove_op(next_op)
+
+                    if not is_next_split:
+                        new_outs = _analyse_decomp_results(
+                            orig_outs, decomp_outs, op
+                        )
+                        _check_op_results(op.name(), orig_outs, new_outs)
+                        op.replace_all_uses_with(new_outs)
+
                     block.remove_op(op)
 
+                if op.name() == "builtin.combine":
+                    pre_combine_op = op
+
+                if pre_combine_op is not None:
+                    remove_op = True
+                    for item in pre_combine_op.results():
+                        if item.has_one_use():
+                            remove_op = False
+                            break
+                    if remove_op:
+                        block.remove_op(pre_combine_op)
+                    pre_combine_op = None
+    paddle.pir.set_insertion_point_to_block_end(block)
+
 
 def decompose_pir_program(pir_program, param_mapping, grad_var_to_var):
     '''
diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py
index 7c2439a059a34..7faa92607719c 100644
--- a/python/paddle/distributed/auto_parallel/static/engine.py
+++ b/python/paddle/distributed/auto_parallel/static/engine.py
@@ -58,6 +58,7 @@
 from .pir_pass import (
     apply_partition_pass,
     apply_reshard_pass,
+    remove_other_rank_op_pass,
     remove_unuseful_comm_op_pass,
 )
 from .planner_v2 import Planner
@@ -696,6 +697,8 @@ def _parallel_pir(self, mode):
         #   collect the communicator created during resolution.
         apply_reshard_pass(dist_program)
 
+        remove_other_rank_op_pass(dist_program)
+
         # Part 4: Optimization Pass
         # NOTE Only those Optimization Pass that related to Parallelism (need dist attr) should be placed here and all the Pass should be Optional.
 
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
index 130e80212f274..f5df914650c2c 100644
--- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
+++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py
@@ -478,9 +478,12 @@ def _apply_post_optimization(
             self._strategy.gradient_merge.avg = True
 
         # gradient_merge is then train-only optimization
+        grad_to_global_grad = {}
         if self.is_train and self._strategy.gradient_merge.enable:
             config = copy.deepcopy(self._strategy.gradient_merge.to_dict())
             config["dist_context"] = self._dist_context
+            config["grad_to_global_grad"] = grad_to_global_grad
+            config["pipeline_mode"] = self._strategy.pipeline.schedule_mode
             if gradient_sync_after_accumulate:
                 config["params_grads"] = global_params_grads
                 config[
@@ -557,4 +560,5 @@ def _apply_post_optimization(
                 "vpp_degree": self._strategy.pipeline.vpp_degree,
                 "dist_context": self._dist_context,
                 "split_backward": self._strategy.pipeline.split_backward,
+                "grad_to_global_grad": grad_to_global_grad,
             }
diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py
index cae150f556967..17579dc1d5071 100644
--- a/python/paddle/distributed/auto_parallel/static/pir_pass.py
+++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py
@@ -89,44 +89,6 @@ def apply_partition_pass(program):
                 var.replace_all_uses_with(reshard_var)
                 reshard_var.get_defining_op().operand(0).set_source(var)
 
-    # pruning op and value not belong to cur rank
-    cur_rank = paddle.distributed.get_rank()
-    for op in program.global_block().ops[::-1]:
-        if op.name() in partition_skip_op_list:
-            can_delete = True
-            for val in op.results():
-                if not val.use_empty():
-                    can_delete = False
-            if can_delete:
-                op.erase()
-            continue
-        if cur_rank not in op.dist_attr.process_mesh.process_ids:
-            op.erase()
-        else:
-            # set the operand as null when it is not belong to cur rank
-            if (
-                op.name() == 'dist_op.reshard'
-                and cur_rank
-                not in op.operand(0)
-                .source()
-                .dist_attr()
-                .process_mesh.process_ids
-            ):
-                op.operand(0).set_source(None)
-
-    # merge pd.data ops for
-    lr_ops = []
-    for op in program.global_block().ops[::-1]:
-        if op.name() == 'pd_op.data' and "learning_rate" in op.attrs()["name"]:
-            lr_ops.append(op)
-
-    if len(lr_ops) > 1:
-        lr_value = lr_ops[0].result(0)
-        for op in lr_ops[1:]:
-            lr = op.result(0)
-            lr.replace_all_uses_with(lr_value)
-            op.erase()
-
 
 def apply_reshard_pass(program):
     for op in program.global_block().ops:
@@ -160,6 +122,40 @@ def apply_reshard_pass(program):
                 op.erase()
 
 
+# Prune ops that do not belong to the current rank, then merge duplicated "learning_rate" pd_op.data ops.
+def remove_other_rank_op_pass(dist_program):
+    cur_rank = paddle.distributed.get_rank()
+    for op in dist_program.global_block().ops[::-1]:  # visit ops in reverse order
+        if op.name() in partition_skip_op_list:
+            can_delete = True
+            for val in op.results():
+                if not val.use_empty():
+                    can_delete = False
+            if can_delete:
+                op.erase()
+            continue  # skip-list ops are erased only when every result is unused; never mesh-checked
+        if cur_rank not in op.dist_attr.process_mesh.process_ids:
+            op.erase()  # the op's process mesh does not contain the current rank
+        elif op.name() == "dist_op.reshard":
+            assert op.result(
+                0
+            ).use_empty(), f'There should not have useful dist.reshard op in remove_other_rank_op_pass. but find : {op}'
+            op.erase()  # a reshard op left on this rank must have an unused result (asserted above)
+
+    # collect every pd_op.data op whose "name" attribute contains "learning_rate" (scanned in reverse order)
+    lr_ops = []
+    for op in dist_program.global_block().ops[::-1]:
+        if op.name() == 'pd_op.data' and "learning_rate" in op.attrs()["name"]:
+            lr_ops.append(op)
+
+    if len(lr_ops) > 1:
+        lr_value = lr_ops[0].result(0)  # keep the first op collected
+        for op in lr_ops[1:]:
+            lr = op.result(0)
+            lr.replace_all_uses_with(lr_value)  # redirect users of the duplicate to the kept value
+            op.erase()
+
+
 # Note: this is the pass in the dense program
 comm_ops = ["pd_op.c_allreduce_sum_", "pd_op.c_allgather"]
 
@@ -172,6 +168,10 @@ def remove_unuseful_comm_op_pass(program):
             if process_group.nranks == 1:
                 op.result(0).replace_all_uses_with(op.operand_source(0))
                 op.erase()
+        if op.name() == "pd_op.share_data_":
+            if op.operand_source(0).has_one_use():
+                op.result(0).replace_all_uses_with(op.operand_source(0))
+                op.erase()
 
 
 # In sequence_parallel, we need to transpose hidden_states
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py
index cf4b9b7b32af1..bbc9b959b72db 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 
 import paddle
 import paddle.distributed as dist
@@ -67,10 +66,12 @@ def get_1D_sub_process_mesh(process_mesh, mesh_dim):
     process_ids = np.array(process_mesh.process_ids).reshape(mesh_shape)
 
     rank_id = dist.get_rank()
+    # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
+    if rank_id not in process_mesh.process_ids:
+        rank_id = process_mesh.process_ids[0]
     coord = list(np.where(process_ids == rank_id))
     coord[mesh_dim] = range(mesh_shape[mesh_dim])
     sub_process_ids = process_ids[tuple(coord)].flatten()
-    sub_mesh_shape = sub_process_ids.shape
     sub_mesh_name = dim_names[mesh_dim]
 
     return dist.ProcessMesh(sub_process_ids, [sub_mesh_name])
@@ -106,35 +107,31 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
         first_diff_axis = find_first_diff_shard_axis(
             src_dist_attr, dst_dist_attr
         )
-        ori_dst_dist_attr = copy_dist_attr_with_new_member(dst_dist_attr)
-        out_value = src_value  # intermediate result
-        src_type = src_value.type()
+        # out_value = src_value  # intermediate result
+        # src_type = src_value.type()
         tensor_ndim = len(src_value.shape)
         process_mesh = dst_dist_attr.process_mesh
 
         # Step2. Convert the non-replicated dimensions to replicated.
         # Step2.1. convert partial status to replicated
-        real_out_dist_attr = copy_dist_attr_with_new_member(src_dist_attr)
         if is_partial(src_dist_attr):
-            in_partial_status = copy.deepcopy(src_dist_attr.partial_status)
+            in_partial_status = src_dist_attr.partial_status
             out_partial_status = dst_dist_attr.partial_status  # read-only
             # convert each partial dim to replicated with corresponding
             # 1-D mesh function
             for partial_dim, partial_type in in_partial_status.items():
-                if (
-                    partial_dim in out_partial_status
-                    or partial_dim in ori_dst_dist_attr.dims_mapping
-                ):
+                if partial_dim in out_partial_status:
                     continue
 
                 # get the partial status after converting
-                real_out_partial_status = copy.deepcopy(
-                    real_out_dist_attr.partial_status
+                tmp_partial_status = src_dist_attr.partial_status
+                tmp_partial_status.pop(partial_dim)
+                tmp_dst_dist_attr = copy_dist_attr_with_new_member(
+                    src_dist_attr,
+                    new_partial_status=tmp_partial_status,
                 )
-                real_out_partial_status.pop(partial_dim)
-                real_out_dist_attr = copy_dist_attr_with_new_member(
-                    real_out_dist_attr,
-                    new_partial_status=real_out_partial_status,
+                tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                    src_value.type(), tmp_dst_dist_attr
                 )
 
                 # get the process_mesh on specific axis
@@ -160,28 +157,29 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
                 )
 
                 one_dim_func = PToRReshardFunction()
-                out_value = one_dim_func.reshard(
+                src_value = one_dim_func.reshard(
                     in_one_dim_dist_attr,
                     out_one_dim_dist_attr,
-                    out_value,
-                    src_type,
+                    src_value,
+                    tmp_dst_type,
                 )
-
-                out_value.update_dist_attr(real_out_dist_attr)
+                src_dist_attr = tmp_dst_dist_attr
 
         # Step2.2 convert shard status to replicated
         for i in range(first_diff_axis, -1, -1):
-            in_mesh_axis = real_out_dist_attr.dims_mapping[i]
-            if in_mesh_axis == -1:
+            in_mesh_axis = src_dist_attr.dims_mapping[i]
+            out_mesh_axis = dst_dist_attr.dims_mapping[i]
+            if in_mesh_axis == -1 or in_mesh_axis == out_mesh_axis:
                 continue
 
             # calculate the dist_attr after converting
-            real_out_dims_mapping = copy.deepcopy(
-                real_out_dist_attr.dims_mapping
+            tmp_dims_mapping = src_dist_attr.dims_mapping
+            tmp_dims_mapping[i] = -1
+            tmp_dst_dist_attr = copy_dist_attr_with_new_member(
+                src_dist_attr, new_dims_mapping=tmp_dims_mapping
             )
-            real_out_dims_mapping[i] = -1
-            real_out_dist_attr = copy_dist_attr_with_new_member(
-                real_out_dist_attr, new_dims_mapping=real_out_dims_mapping
+            tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                src_value.type(), tmp_dst_dist_attr
             )
 
             # get the process_mesh on specific axis
@@ -205,45 +203,41 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
             )
 
             one_dim_func = SToRReshardFunction()
-            out_value = one_dim_func.reshard(
-                in_one_dim_dist_attr, out_one_dim_dist_attr, out_value, src_type
+            src_value = one_dim_func.reshard(
+                in_one_dim_dist_attr,
+                out_one_dim_dist_attr,
+                src_value,
+                tmp_dst_type,
             )
-
-            out_value.update_dist_attr(real_out_dist_attr)
+            src_dist_attr = tmp_dst_dist_attr
 
         # Step3. Convert the replicated status to the status in dst_dist_attr
         # Step3.1 convert replicated to partial
-        if is_partial(ori_dst_dist_attr):
-            in_partial_status = out_value.dist_attr.partial_status
-            out_partial_status = ori_dst_dist_attr.partial_status
+        if is_partial(dst_dist_attr):
+            in_partial_status = src_dist_attr.partial_status
+            out_partial_status = dst_dist_attr.partial_status
             for partial_dim, partial_type in out_partial_status.items():
                 if partial_dim in in_partial_status:
                     continue
-
                 raise NotImplementedError(
                     "RToPReshardFunction is not implemented"
                 )
 
-        # Step3.2 convert replicated/partial to shard
+        # Step3.2 convert replicated to shard
         for i in range(first_diff_axis, -1, -1):
-            out_mesh_axis = ori_dst_dist_attr.dims_mapping[i]
-            if out_mesh_axis == -1:
+            in_mesh_axis = src_dist_attr.dims_mapping[i]
+            out_mesh_axis = dst_dist_attr.dims_mapping[i]
+            if in_mesh_axis == out_mesh_axis:
                 continue
-            in_partial_status = out_value.dist_attr().partial_status
-            need_p2s = out_mesh_axis in in_partial_status
-            dims_mapping = copy.deepcopy(real_out_dist_attr.dims_mapping)
-            dims_mapping[i] = out_mesh_axis
-            partial_status = None
-            if out_mesh_axis in real_out_dist_attr.partial_status:
-                partial_status = copy.deepcopy(
-                    real_out_dist_attr.partial_status
-                )
-                partial_status.pop(out_mesh_axis)
 
-            real_out_dist_attr = copy_dist_attr_with_new_member(
-                real_out_dist_attr,
-                new_dims_mapping=dims_mapping,
-                new_partial_status=partial_status,
+            # calculate the dist_attr after converting
+            tmp_dims_mapping = src_dist_attr.dims_mapping
+            tmp_dims_mapping[i] = out_mesh_axis
+            tmp_dst_dist_attr = copy_dist_attr_with_new_member(
+                src_dist_attr, new_dims_mapping=tmp_dims_mapping
+            )
+            tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                src_value.type(), tmp_dst_dist_attr
             )
 
             # get the process_mesh on specific axis
@@ -265,23 +259,15 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
                     sub_mesh, out_one_dim_dims_mapping, {}
                 )
             )
-
-            if need_p2s:
-                raise NotImplementedError(
-                    "PToSReshardFunction is not implemented"
-                )
-            else:
-                one_dim_func = RToSReshardFunction()
-                out_value = one_dim_func.reshard(
-                    in_one_dim_dist_attr,
-                    out_one_dim_dist_attr,
-                    out_value,
-                    dst_type,
-                )
-                out_value.update_dist_attr(real_out_dist_attr)
-
-        out_value.set_type(dst_type)
-        return out_value
+            one_dim_func = RToSReshardFunction()
+            src_value = one_dim_func.reshard(
+                in_one_dim_dist_attr,
+                out_one_dim_dist_attr,
+                src_value,
+                tmp_dst_type,
+            )
+            src_dist_attr = tmp_dst_dist_attr
+        return src_value
 
 
 class NdMeshReshardFunctionCrossMesh(ReshardFunction):
@@ -310,20 +296,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
         tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
             src_value.type(), tmp_dist_attr
         )
-        out_value = same_status_func.reshard(
+        src_value = same_status_func.reshard(
             src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type
         )
 
-        if out_value is None:
-            return None
-
-        curr_global_rank = paddle.distributed.get_rank()
-        if curr_global_rank in dst_dist_attr.process_mesh.process_ids:
-            nd_mesh_func = NdMeshReshardFunction()
-            assert nd_mesh_func.is_suitable(
-                tmp_dist_attr, dst_dist_attr
-            ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}"
-            return nd_mesh_func.reshard(
-                tmp_dist_attr, dst_dist_attr, out_value, dst_type
-            )
-        return None
+        nd_mesh_func = NdMeshReshardFunction()
+        assert nd_mesh_func.is_suitable(
+            tmp_dist_attr, dst_dist_attr
+        ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}"
+        return nd_mesh_func.reshard(
+            tmp_dist_attr, dst_dist_attr, src_value, dst_type
+        )
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py
index 8956cc2535d9b..d5046ff0f7963 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py
@@ -47,7 +47,7 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
             src_reduce_type = ReduceOp.SUM
             reduce_mean = True
 
-        group = new_process_group(src_mesh.process_ids)
+        group = new_process_group(sorted(src_mesh.process_ids))
         reduced_value = paddle._C_ops.c_allreduce_sum_(
             src_value, group.id, True, False
         )
@@ -95,20 +95,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
         tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
             src_value.type(), tmp_dist_attr
         )
-        out_value = same_status_func.reshard(
+        src_value = same_status_func.reshard(
             src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type
         )
 
-        if out_value is None:
-            return None
-
-        curr_global_rank = paddle.distributed.get_rank()
-        if curr_global_rank in dst_dist_attr.process_mesh.process_ids:
-            p_to_r_func = PToRReshardFunction()
-            assert p_to_r_func.is_suitable(
-                tmp_dist_attr, dst_dist_attr
-            ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}"
-            return p_to_r_func.reshard(
-                tmp_dist_attr, dst_dist_attr, out_value, dst_type
-            )
-        return None
+        p_to_r_func = PToRReshardFunction()
+        assert p_to_r_func.is_suitable(
+            tmp_dist_attr, dst_dist_attr
+        ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}"
+        return p_to_r_func.reshard(
+            tmp_dist_attr, dst_dist_attr, src_value, dst_type
+        )
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py
index 922df440c5a21..e2999864f4e87 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py
@@ -50,6 +50,19 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
         if curr_global_rank in mesh.process_ids:
             total_nums = src_value.shape[split_axis]
             num_of_pieces = mesh.shape[mesh_axis]
+            if num_of_pieces == 1:
+                dst_value = paddle._C_ops.share_data_(src_value)
+                share_data_op = dst_value.get_defining_op()
+                # set dist type and dist attr
+                dst_value.set_type(dst_type)
+                share_data_op.dist_attr = (
+                    paddle.base.libpaddle.pir.create_op_dist_attribute(
+                        src_dist_attr.process_mesh,
+                        [src_dist_attr],
+                        [dst_dist_attr],
+                    )
+                )
+                return dst_value
             piece_len = (total_nums + num_of_pieces - 1) // num_of_pieces
             rank_relative = mesh.process_ids.index(curr_global_rank)
             start = rank_relative * piece_len
@@ -59,15 +72,17 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
 
             out_value = paddle.slice(src_value, [split_axis], [start], [end])
 
-            out_value.set_type(src_value.type())
-            out_value.update_dist_attr(dst_dist_attr)
+            out_value.set_type(dst_type)
             out_value.get_defining_op().dist_attr = (
                 paddle.base.libpaddle.pir.create_op_dist_attribute(
                     mesh, [src_dist_attr], [dst_dist_attr]
                 )
             )
             return out_value
-        return None
+        # fake var will be removed in remove_other_rank_op_pass.
+        fake_var = paddle._C_ops.reshard_v2(src_value, dst_dist_attr)
+        fake_var.set_type(dst_type)
+        return fake_var
 
 
 class RToSReshardFunctionCrossMesh(ReshardFunction):
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py
index 5a907839cf78b..6c9c564cf6196 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py
@@ -69,6 +69,18 @@ def infer_allgather_dist_type(self, in_value, split_axis):
         return out_type
 
     def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
+        if src_dist_attr.process_mesh.size == 1:
+            dst_value = paddle._C_ops.share_data_(src_value)
+            share_data_op = dst_value.get_defining_op()
+            # set dist type and dist attr
+            dst_value.set_type(dst_type)
+            share_data_op.dist_attr = (
+                paddle.base.libpaddle.pir.create_op_dist_attribute(
+                    src_dist_attr.process_mesh, [src_dist_attr], [dst_dist_attr]
+                )
+            )
+            return dst_value
+
         def get_split_axis_with_dims_mapping(dims_mapping):
             split_axis = {}
             for idx, v in enumerate(dims_mapping):
@@ -102,8 +114,7 @@ def get_split_axis_with_dims_mapping(dims_mapping):
             return new_value
         else:
             # TODO(ywt01) support unbalanced split
-            pass
-        return None
+            raise NotImplementedError("unbalanced split is not implemented")
 
     def reshard_s_to_r_with_padding(
         self,
@@ -116,8 +127,8 @@ def reshard_s_to_r_with_padding(
     ):
         src_mesh = src_dist_attr.process_mesh
         num_of_process = len(src_mesh.process_ids)
-        dtype = src_value.dtype
-        group = new_process_group(src_mesh.process_ids)
+
+        group = new_process_group(sorted(src_mesh.process_ids))
         allgather_value = paddle._C_ops.c_allgather(
             src_value, group.id, num_of_process, True
         )
@@ -138,11 +149,32 @@ def reshard_s_to_r_with_padding(
 
         if split_axis != 0 or padding_num != 0:
             allgather_op = allgather_value.get_defining_op()
-            paddle.pir.set_insertion_point_after(allgather_op)
-            split_value = paddle._C_ops.split_with_num(
+            split_values = paddle._C_ops.split_with_num(
                 allgather_op.result(0), num_of_process, 0
             )
-            concat_value = paddle._C_ops.concat(split_value, split_axis)
+            builtin_split_op = split_values[0].get_defining_op()
+            pd_splite_op = builtin_split_op.operand_source(0).get_defining_op()
+
+            # fix the split_with_num dist attribute.
+            new_inner_types = []
+            for sub_value in split_values:
+                new_inner_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+                    sub_value.type(), allgather_value.dist_attr()
+                )
+                new_inner_types.append(new_inner_type)
+                sub_value.set_type(new_inner_type)
+            vec_type = paddle.base.libpaddle.pir.create_vec_type(
+                new_inner_types
+            )
+            pd_splite_op.result(0).set_type(vec_type)
+
+            concat_value = paddle._C_ops.concat(split_values, split_axis)
+            # fold builtin.split op and builtin.combine op
+            concat_op = concat_value.get_defining_op()
+            builtin_combine_op = concat_op.operand_source(0).get_defining_op()
+            concat_op.operand(0).set_source(pd_splite_op.result(0))
+            builtin_combine_op.erase()
+            builtin_split_op.erase()
             return concat_value
         return allgather_value
 
@@ -183,16 +215,11 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
         out_value = same_status_func.reshard(
             src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type
         )
-        if out_value is None:
-            return None
-
-        curr_global_rank = paddle.distributed.get_rank()
-        if curr_global_rank in dst_dist_attr.process_mesh.process_ids:
-            s_to_r_func = SToRReshardFunction()
-            assert s_to_r_func.is_suitable(
-                tmp_dist_attr, dst_dist_attr
-            ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}"
-            return s_to_r_func.reshard(
-                tmp_dist_attr, dst_dist_attr, out_value, dst_type
-            )
-        return None
+
+        s_to_r_func = SToRReshardFunction()
+        assert s_to_r_func.is_suitable(
+            tmp_dist_attr, dst_dist_attr
+        ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}"
+        return s_to_r_func.reshard(
+            tmp_dist_attr, dst_dist_attr, out_value, dst_type
+        )
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py
index ceae2e7424fd6..db6ec8d1df238 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py
@@ -87,11 +87,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type):
                         dst_mesh, [], [dst_dist_attr]
                     )
                 )
-                recv_value.update_dist_attr(dst_dist_attr)
+                recv_value.set_type(dst_type)
                 is_send = False
                 break
 
         if is_send:
-            return None
+            # fake var will be removed in remove_other_rank_op_pass.
+            fake_var = paddle._C_ops.reshard_v2(src_value, dst_dist_attr)
+            fake_var.set_type(dst_type)
+            return fake_var
         else:
             return recv_value
diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py
index 741120f7fe598..2db4cb6e0bdcc 100644
--- a/python/paddle/distributed/auto_tuner/utils.py
+++ b/python/paddle/distributed/auto_tuner/utils.py
@@ -1068,7 +1068,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg):
                         prefix + str(cfg[arg]) if prefix else cfg[arg]
                     )
                 json.dump(cmd_cfg, open(cmd[arg][0], "w"))
-                if tuner_cfg["run_cmd"].get("generate_launch_cfg", True):
+                if (
+                    tuner_cfg["run_cmd"].get("generate_launch_cfg", True)
+                    and not run_best
+                ):
                     new_cmd_apth = (
                         os.path.splitext(cmd[arg][0])[0]
                         + "_"
@@ -1107,7 +1110,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg):
                         prefix + str(cfg[arg]) if prefix else cfg[arg]
                     )
                 yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
-                if tuner_cfg["run_cmd"].get("generate_launch_cfg", True):
+                if (
+                    tuner_cfg["run_cmd"].get("generate_launch_cfg", True)
+                    and not run_best
+                ):
                     new_cmd_apth = (
                         os.path.splitext(cmd[arg][0])[0]
                         + cfg["log_dir_name"]
@@ -1157,7 +1163,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg):
                 else:
                     cmd_cfg[keys[-1]] = rr_values
                 json.dump(cmd_cfg, open(cmd[arg][0], "w"))
-                if tuner_cfg["run_cmd"].get("generate_launch_cfg", True):
+                if (
+                    tuner_cfg["run_cmd"].get("generate_launch_cfg", True)
+                    and not run_best
+                ):
                     new_cmd_apth = (
                         os.path.splitext(cmd[arg][0])[0]
                         + cfg["log_dir_name"]
@@ -1198,7 +1207,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg):
                 else:
                     cmd_cfg[keys[-1]] = rr_values
                 yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
-                if tuner_cfg["run_cmd"].get("generate_launch_cfg", True):
+                if (
+                    tuner_cfg["run_cmd"].get("generate_launch_cfg", True)
+                    and not run_best
+                ):
                     new_cmd_apth = (
                         os.path.splitext(cmd[arg][0])[0]
                         + cfg["log_dir_name"]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
index 63f76416142c1..ce6154b1ca8db 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
@@ -119,6 +119,14 @@ def __init__(self, optimizer, hcg):
         self._broadcast_overlap = False
         self._forward_pre_hook_remove_helper = []
 
+        if (
+            paddle.is_compiled_with_xpu()
+            and os.getenv("XPU_CDNN_CLUSTER_PARALLEL") is not None
+        ):
+            assert (
+                not self.comm_overlap
+            ), "comm overlap not support when use xpu cdnn_cluster parallel."
+
         try:
             # The fp32 params such as layer_norm_0.w_0 will be at the end of param_list.
             # Have to sort the params to make sure all params are in the forward using order.
@@ -312,6 +320,14 @@ def reduce_gradients(self, parameter_list, hcg):
             for buffer in self._comm_buffers:
                 buffer.scale_grads()
             return
+
+        # sync here to guarantee the correctness of cdnn_cluster parallel.
+        if (
+            paddle.is_compiled_with_xpu()
+            and os.getenv("XPU_CDNN_CLUSTER_PARALLEL") is not None
+        ):
+            paddle.device.synchronize()
+
         with framework.no_grad():
             for param in parameter_list:
                 g_var = self._get_param_grad(param)
@@ -624,6 +640,14 @@ def __init__(self, optimizer, hcg):
         self._set_inner_opt_attr('_parameter_list', self._local_parameter_list)
         self._set_inner_opt_attr('_param_groups', self._local_parameter_list)
 
+        if (
+            paddle.is_compiled_with_xpu()
+            and os.getenv("XPU_CDNN_CLUSTER_PARALLEL") is not None
+        ):
+            assert (
+                not self.comm_overlap
+            ), "comm overlap not support when use xpu cdnn_cluster parallel."
+
         # Ensure acc_steps is greater than 0 when comm_overlap is used
         if self.comm_overlap:
             assert (
@@ -739,6 +763,14 @@ def filter_parameters(self, parameter_list, hcg):
     def reduce_gradients(self, parameter_list, hcg):
         # TODO merge grad / nrank with dp
         logger.debug("sharding start gradients sync")
+
+        # sync here to guarantee the correctness of cdnn_cluster parallel.
+        if (
+            paddle.is_compiled_with_xpu()
+            and os.getenv("XPU_CDNN_CLUSTER_PARALLEL") is not None
+        ):
+            paddle.device.synchronize()
+
         with framework.no_grad():
             for comm_buffer in self._comm_buffer_list:
                 if self.pp_release_grads and comm_buffer.grad_storage is None:
@@ -812,6 +844,7 @@ def copy_attr(attr_name):
         copy_attr("optimize_attr")
         copy_attr("do_model_average")
         copy_attr("need_clip")
+        copy_attr("no_sync")
 
         self._slice_params[param.name] = slice_param
         return slice_param
diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
index 33b8c3d95d582..db8c2f7b9b820 100644
--- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
@@ -24,6 +24,7 @@
     paddle.float32: "float32",
     paddle.float64: "float64",
     paddle.bfloat16: "bfloat16",
+    paddle.bool: "bool",
 }
 
 PADDLE_TO_NUMBER = {
@@ -33,6 +34,7 @@
     paddle.int32: 3,
     paddle.int64: 4,
     paddle.bfloat16: 5,
+    paddle.bool: 6,
 }
 
 NUMBER_TO_DTYPE = {
@@ -42,6 +44,7 @@
     3: "int32",
     4: "int64",
     5: "bfloat16",
+    6: "bool",
 }
 
 
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index 816af6f91530d..53d929c7890bd 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -451,7 +451,9 @@ def check_layer_sparse(sublayer):
             return False
 
         is_sparse_gradient = [
-            check_layer_sparse(sublayer) for sublayer, _ in layers_param
+            check_layer_sparse(sublayer)
+            for sublayer, param in layers_param
+            if not getattr(param, "no_sync", False)
         ]
 
         if in_dynamic_mode():
diff --git a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py
index 77affd4cd9c1e..e22cc5bbf6d65 100644
--- a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py
+++ b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py
@@ -138,8 +138,14 @@ def _split_matmul_grad_and_multi_streaming_allreduce(
                 name: allreduce_op.output(name) for name in allreduce_op_outputs
             }
 
+            # matmul_v2 + reshape + reshape + matmul_v2 + reshape + ... + original c_allreduce_sum
+            # =>
+            # matmul_v2 + new c_allreduce_sum + reshape + reshape + matmul_v2 + reshape + ... + original c_allreduce_sum
+            #
+            # NOTE(liym27): new c_allreduce_sum must be inserted to "the next of the first matmul_v2", otherwise another
+            # pass fused_linear_param_grad_add will not work.
             allreduce_op = block._insert_op_without_sync(
-                index=allreduce_id + 1,
+                index=matmul_grad_id + 1,
                 type=allreduce_op.type,
                 inputs=allreduce_op_inputs,
                 outputs=allreduce_op_outputs,
diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
index aab9bdb2456a0..2d7413965ae3b 100644
--- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
+++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py
@@ -523,6 +523,8 @@ def parse_program(
         dist_context,
     )
 
+    return grad_to_gradient_merge
+
 
 @register_pass("auto_parallel_gradient_merge_pass")
 class GradientMergePass(PassBase):
@@ -550,8 +552,9 @@ def _apply_single_impl(self, main_program, startup_program, context):
         gradient_sync_after_accumulate = self.get_attr(
             "gradient_sync_after_accumulate", False
         )
+        grad_to_global_grad = self.get_attr("grad_to_global_grad", {})
         with paddle.static.program_guard(main_program, startup_program):
-            parse_program(
+            grad_to_merge_grad = parse_program(
                 main_program,
                 startup_program,
                 params_grads,
@@ -562,3 +565,5 @@ def _apply_single_impl(self, main_program, startup_program, context):
             )
 
         main_program._sync_with_cpp()
+        for k, v in grad_to_merge_grad.items():
+            grad_to_global_grad[k] = v
diff --git a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py
index 4fc9a1ec28692..8bc29411269ab 100644
--- a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py
+++ b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py
@@ -15,6 +15,10 @@
 import logging
 
 from paddle.base import core
+from paddle.distributed.auto_parallel.static.operators.common import (
+    is_data_parallel_reduce_op,
+    is_data_parallel_scale_op,
+)
 
 from ...utils.log_utils import get_logger
 from ..pass_base import register_pass
@@ -36,7 +40,8 @@
 class PipelineVirtualPipelinePass(PipelinePassBase):
     def __init__(self):
         super().__init__()
-
+        self._real_overlap_sharding_reduce = False
+        self.reduce_comm_suffix = "_reduce"
         self._forward_micro_step_counter = {}
         self._backward_micro_step_counter = {}
 
@@ -137,10 +142,22 @@ def _get_virtual_pp_rank(micro_step, forward):
         if real_split_backward:
             for chunk_id in range(num_model_chunks - 1, -1, -1):
                 for micro_batch_id in range(0, accumulate_steps):
-                    w_job = core.Job(BACKWARD + "_w" + str(chunk_id))
+                    if (
+                        self._real_overlap_sharding_reduce
+                        and micro_batch_id == accumulate_steps - 1
+                    ):
+                        w_job = core.Job(
+                            BACKWARD
+                            + "_w"
+                            + str(chunk_id)
+                            + self.reduce_comm_suffix
+                        )
+                    else:
+                        w_job = core.Job(BACKWARD + "_w" + str(chunk_id))
                     w_job.set_micro_batch_id(micro_batch_id)
                     job_list.append(w_job)
-
+        job_types = [job.type() for job in job_list]
+        logger.debug(f"The VPP job list: {job_types}")
         opt_job = core.Job(OPT)
         job_list.append(opt_job)
         return job_list
@@ -162,6 +179,102 @@ def _split_matmul_grad_ops_to_matmul(self, program, dist_context):
                     block, matmul_grad_id, dist_context=dist_context
                 )
 
+    def _move_sharding_comm_to_backward(
+        self, types, sub_programs, global_grads
+    ):
+        def _get_sharding_comm_op(op, idx, ops):
+            if not is_data_parallel_reduce_op(op):
+                return
+            grad_name = op.desc.input_arg_names()[0]
+            in_place = grad_name == op.desc.output_arg_names()[0]
+            # Only an in-place reduce of a grad listed in global_grads is kept.
+            if not in_place or grad_name not in global_grads:
+                return
+            global_grad_to_comm_op[grad_name] = [op]
+            remove_op_ids.append(idx)
+            # A sum-reduce may be followed by its data-parallel scale op; keep
+            # that scale op with the reduce so both are moved and removed.
+            nxt = idx + 1
+            if op.type in ["c_allreduce_sum", "c_reduce_sum"]:
+                if nxt < len(ops) and is_data_parallel_scale_op(ops[nxt]):
+                    global_grad_to_comm_op[grad_name].append(ops[nxt])
+                    remove_op_ids.append(nxt)
+
+        def _get_scale_op(op, idx):
+            if is_data_parallel_scale_op(op):
+                return
+            if op.type == 'scale':
+                op_input_names = op.desc.input_arg_names()
+                op_output_names = op.desc.output_arg_names()
+                if (
+                    op_input_names[0] == op_output_names[0]
+                    and op_input_names[0] in global_grads
+                ):
+                    global_grad_to_scale_op[op_input_names[0]] = op
+                    remove_op_ids.append(idx)
+
+        # 1 get the all sharding_avg in optimizer
+        type_programs = dict(zip(types, sub_programs))
+        opt_program = type_programs["optimizer"]
+        global_grad_to_comm_op = {}
+        global_grad_to_scale_op = {}
+        all_remove_op_ids = []
+        for cur_block in opt_program.blocks:
+            remove_op_ids = []
+            for idx, op in enumerate(cur_block.ops):
+                _get_scale_op(op, idx)
+                _get_sharding_comm_op(op, idx, cur_block.ops)
+            all_remove_op_ids.append(remove_op_ids)
+        if len(global_grad_to_comm_op) == 0:  # no need to overlap sharding comm
+            return False
+
+        # 2 create the new backward(w) with the sharding_comm
+        new_types = []
+        new_programs = []
+        for type, sub_program in type_programs.items():
+            if "backward_w" in type:
+                new_program = sub_program.clone()
+                cur_block = new_program.global_block()
+                cur_block_scale_op = []
+                for idx, op in reversed(list(enumerate(cur_block.ops))):
+                    if op.type == "elementwise_add":
+                        input_arg_names = op.input_arg_names
+                        output_arg_names = op.output_arg_names
+                        if (
+                            input_arg_names[0] == output_arg_names[0]
+                            and input_arg_names[0] in global_grad_to_comm_op
+                        ):
+                            for origin_op in reversed(
+                                global_grad_to_comm_op[input_arg_names[0]]
+                            ):
+                                new_op = cur_block._insert_op_without_sync(
+                                    index=idx + 1, type="nop"
+                                )
+                                new_op.desc.copy_from(origin_op.desc)
+                            del global_grad_to_comm_op[input_arg_names[0]]
+                            cur_block_scale_op.append(
+                                global_grad_to_scale_op[input_arg_names[0]]
+                            )
+                for origin_op in cur_block_scale_op:
+                    new_op = cur_block.append_op(type="nop")
+                    new_op.desc.copy_from(origin_op.desc)
+                cur_block._sync_with_cpp()
+                new_types.append(type + self.reduce_comm_suffix)
+                new_programs.append(new_program)
+        assert (
+            len(global_grad_to_comm_op) == 0
+        ), f"global_grad_to_comm_op must be used up, but left: {global_grad_to_comm_op}"
+
+        types.extend(new_types)
+        sub_programs.extend(new_programs)
+
+        for id, cur_block in enumerate(opt_program.blocks):
+            for op_id in reversed(all_remove_op_ids[id]):
+                cur_block._remove_op(op_id)
+            cur_block._sync_with_cpp()
+
+        return True
+
     def _partial_programs(self, program):
         dist_context = self.get_attr("dist_context")
         num_model_chunks = self.get_attr("vpp_degree")
@@ -169,7 +282,10 @@ def _partial_programs(self, program):
         accumulate_steps = self.get_attr("num_micro_batches")
         num_stages = self.get_attr("pp_degree")
         split_backward = self.get_attr("split_backward", False)
-
+        grad_to_global_grad = self.get_attr("grad_to_global_grad", {})
+        global_grads = [
+            global_grad for _, global_grad in grad_to_global_grad.items()
+        ]
         if split_backward and accumulate_steps == num_stages:
             self._split_matmul_grad_ops_to_matmul(program, dist_context)
             types, sub_program_list = _program_for_vpp_split_bwk(
@@ -178,6 +294,11 @@ def _partial_programs(self, program):
                 dist_context,
                 enable_send_recv_overlap,
             )
+            self._real_overlap_sharding_reduce = (
+                self._move_sharding_comm_to_backward(
+                    types, sub_program_list, global_grads
+                )
+            )
         else:
             types, sub_program_list = _program_for_vpp(
                 program,
diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py
index 246c4ffb71173..168fbc460d5bd 100644
--- a/python/paddle/distribution/__init__.py
+++ b/python/paddle/distribution/__init__.py
@@ -34,6 +34,7 @@
 from .multivariate_normal import MultivariateNormal
 from .normal import Normal
 from .poisson import Poisson
+from .student_t import StudentT
 from .transform import (  # noqa:F401
     AbsTransform,
     AffineTransform,
@@ -77,6 +78,7 @@
     'Geometric',
     'Binomial',
     'Poisson',
+    'StudentT',
 ]
 
 __all__.extend(transform.__all__)
diff --git a/python/paddle/distribution/student_t.py b/python/paddle/distribution/student_t.py
new file mode 100644
index 0000000000000..d1a88887023ff
--- /dev/null
+++ b/python/paddle/distribution/student_t.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Sequence
+
+import paddle
+from paddle.base.data_feeder import check_type, convert_dtype
+from paddle.base.framework import Variable
+from paddle.distribution import Gamma, distribution
+from paddle.framework import in_dynamic_mode
+
+
+class StudentT(distribution.Distribution):
+    r"""
+    The StudentT distribution with parameters: `df`, `loc`, `scale`.
+
+    In probability theory and statistics, the StudentT distribution is one of the basic continuous probability distributions
+    defined on the real number set.
+
+    The probability density function (pdf) is
+
+    .. math::
+
+        pdf(x; \nu, \mu, \sigma) = \frac{\Gamma[(\nu+1)/2]}{\sigma\sqrt{\nu\pi}\Gamma(\nu/2)[1+(\frac{x-\mu}{\sigma})^2/\nu]^{(1+\nu)/2}}
+
+    In the above equation:
+
+    * :math:`df = \nu`: is the degree of freedom.
+    * :math:`loc = \mu`: is the center parameter.
+    * :math:`scale = \sigma`: is the scale parameter.
+    * :math:`\Gamma(\cdot)`: is the gamma function.
+
+    Args:
+        df (float|Tensor): The degree of freedom of the distribution, which should be non-negative. If the input data type is float,
+            the data type of `df` will be converted to a 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64.
+        loc (float|Tensor): The center of the distribution. If the input data type is float, the data type of `loc` will be converted to a
+            1-D Tensor with paddle global default dtype. Supported dtype: float32, float64.
+        scale (float|Tensor): The scale of the distribution, which should be non-negative. If the input data type is float, the data type
+            of `scale` will be converted to a 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64.
+        name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> from paddle.distribution import StudentT
+            >>> paddle.set_device('cpu')
+            >>> paddle.seed(100)
+            >>> dist = StudentT(df=10.0, loc=0.0, scale=1.0)
+            >>> dist.sample([3])
+            Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-2.07709980],
+             [ 0.27981189],
+             [ 0.00881413]])
+
+            >>> dist2 = StudentT(df=paddle.to_tensor([10.0, 5.0]), loc=paddle.to_tensor([0.0, 0.0]), scale=paddle.to_tensor([1.0, 2.0]))
+            >>> value_tensor = paddle.to_tensor([0.8], dtype="float32")
+            >>> lp = dist2.log_prob(value_tensor)
+            >>> print(lp)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [-1.28509235, -1.75626254])
+
+            >>> p = dist2.prob(value_tensor)
+            >>> print(p)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.27662504, 0.17268908])
+
+            >>> entropy = dist2.entropy()
+            >>> print(entropy)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [1.52126312, 2.32064891])
+
+    """
+
+    def __init__(self, df, loc, scale, name=None):
+        if not in_dynamic_mode():
+            check_type(
+                df,
+                'df',
+                (
+                    float,
+                    Variable,
+                    paddle.pir.Value,
+                ),
+                'StudentT',
+            )
+            check_type(
+                loc,
+                'loc',
+                (
+                    float,
+                    Variable,
+                    paddle.pir.Value,
+                ),
+                'StudentT',
+            )
+            check_type(
+                scale,
+                'scale',
+                (
+                    float,
+                    Variable,
+                    paddle.pir.Value,
+                ),
+                'StudentT',
+            )
+
+        self.name = name if name is not None else 'StudentT'
+        self.dtype = paddle.get_default_dtype()
+
+        if self._validate_args(df, loc, scale):
+            self.df = df
+            self.loc = loc
+            self.scale = scale
+            self.df, self.loc, self.scale = paddle.broadcast_tensors(
+                [self.df, self.loc, self.scale]
+            )
+            self.dtype = convert_dtype(df.dtype)
+        else:
+            self.df, self.loc, self.scale = self._to_tensor(df, loc, scale)
+
+        if not self._check_nonnegative(self.df):
+            raise ValueError(
+                'Every element of input parameter `df` should be nonnegative.'
+            )
+        if not self._check_nonnegative(self.scale):
+            raise ValueError(
+                'Every element of input parameter `scale` should be nonnegative.'
+            )
+
+        if self.df.shape == []:
+            self.df = self.df.reshape([1])
+            self.loc = self.loc.reshape([1])
+            self.scale = self.scale.reshape([1])
+        batch_shape = self.df.shape
+        super().__init__(batch_shape)
+        self._chi2 = Gamma(0.5 * self.df, paddle.full_like(self.df, 0.5))
+
+    def _check_nonnegative(self, value):
+        """Check that every element of a parameter tensor is >= 0.
+
+        Args:
+            value (Tensor): The parameter tensor to check; zero is accepted.
+
+        Returns:
+            Tensor: Boolean result of ``(value >= 0.0).all()``.
+        """
+        return (value >= 0.0).all()
+
+    @property
+    def mean(self):
+        """Mean of StudentT distribution.
+
+        Returns:
+            Tensor: `loc` where `df > 1`, NaN elsewhere.
+        """
+        return paddle.where(
+            self.df > 1.0,
+            self.loc,
+            paddle.full_like(self.loc, fill_value=float('nan')),
+        )
+
+    @property
+    def variance(self):
+        """Variance of StudentT distribution.
+
+        Returns:
+            Tensor: `scale**2 * df / (df - 2)` where `df > 2`, inf where `1 < df <= 2`, NaN elsewhere.
+        """
+        var = self.df.clone().detach()
+        var_condition = self.df > 2.0
+        var = paddle.where(
+            var_condition,
+            self.scale.pow(2) * var / (var - 2),
+            paddle.full_like(var, fill_value=float('nan')),
+        )
+        inf_condition = (self.df <= 2.0).logical_and(self.df > 1.0)
+        var = paddle.where(
+            inf_condition, paddle.full_like(var, fill_value=float('inf')), var
+        )
+        return var
+
+    def sample(self, shape=()):
+        """Generate StudentT samples of the specified shape. The final shape would be ``shape+batch_shape`` .
+
+        Args:
+            shape (Sequence[int], optional): Prepended shape of the generated samples.
+
+        Returns:
+            Tensor: Sampled data with shape `shape` + `batch_shape`.
+        """
+        if not isinstance(shape, Sequence):
+            raise TypeError('sample shape must be Sequence object.')
+
+        output_shape = self._extend_shape(shape)
+        z = paddle.cast(paddle.normal(shape=output_shape), self.dtype)
+        chi2 = self._chi2.sample(shape)
+        x = z * paddle.rsqrt(chi2 / self.df)
+        return self.loc + self.scale * x
+
+    def entropy(self):
+        r"""Shannon entropy in nats.
+
+        The entropy is
+
+        .. math::
+
+            H = \log(\frac{\Gamma(\nu/2)\Gamma(1/2) \sigma \sqrt{\nu}}{\Gamma[(1+\nu)/2]}) + \frac{(1+\nu)}{2} \cdot \{\psi[(1+\nu)/2] - \psi(\nu/2)\}
+
+        In the above equation:
+
+        * :math:`\nu`: is the degree of freedom.
+        * :math:`\Gamma()`: is the gamma function.
+        * :math:`\psi()`: is the digamma function.
+
+        Returns:
+            Tensor: Shannon entropy of StudentT distribution. The data type is the same as `df`.
+        """
+        lbeta = (
+            paddle.lgamma(0.5 * self.df)
+            + math.lgamma(0.5)
+            - paddle.lgamma(0.5 * (self.df + 1))
+        )
+        return (
+            self.scale.log()
+            + 0.5
+            * (self.df + 1)
+            * (
+                paddle.digamma(0.5 * (self.df + 1))
+                - paddle.digamma(0.5 * self.df)
+            )
+            + 0.5 * self.df.log()
+            + lbeta
+        )
+
+    def log_prob(self, value):
+        """Log probability density function.
+
+        Args:
+          value (Tensor): The input tensor.
+
+        Returns:
+          Tensor: log probability density. The data type is the same as `df`.
+        """
+        value = self._check_values_dtype_in_probs(self.df, value)
+        y = (value - self.loc) / self.scale
+        Z = (
+            self.scale.log()
+            + 0.5 * self.df.log()
+            + 0.5 * math.log(math.pi)
+            + paddle.lgamma(0.5 * self.df)
+            - paddle.lgamma(0.5 * (self.df + 1.0))
+        )
+        return -0.5 * (self.df + 1.0) * paddle.log1p(y**2.0 / self.df) - Z
+
+    def prob(self, value):
+        """Probability density function.
+
+        Args:
+            value (Tensor): The input tensor.
+
+        Returns:
+            Tensor: probability density. The data type is the same as `df`.
+        """
+        return paddle.exp(self.log_prob(value))
diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py
index 8ae915a1e4868..b2f19ee83f806 100644
--- a/python/paddle/incubate/autograd/functional.py
+++ b/python/paddle/incubate/autograd/functional.py
@@ -582,7 +582,7 @@ def _grad(ys, xs, v=None):
         # xs_grad when the xs is a single Tensor.
         xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True)
         if (
-            isinstance(xs, paddle.base.framework.Variable)
+            isinstance(xs, (paddle.base.framework.Variable, paddle.pir.Value))
             and isinstance(xs_grad, typing.Sequence)
             and len(xs_grad) > 0
         ):
@@ -658,23 +658,27 @@ def _check_inputs(func, xs, v=None):
     if not callable(func):
         raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.")
 
-    if not isinstance(xs, (framework.Variable, typing.Sequence)):
+    if not isinstance(
+        xs, (framework.Variable, typing.Sequence, paddle.pir.Value)
+    ):
         raise TypeError(
             f"Expected 'xs' is a Tensor|Sequence[Tensor],"
             f"but got {type(xs)}."
         )
     if isinstance(xs, typing.Sequence) and not all(
-        isinstance(x, framework.Variable) for x in xs
+        isinstance(x, (framework.Variable, paddle.pir.Value)) for x in xs
     ):
         raise TypeError("All elements of 'xs' should be Tensor.")
 
-    if not isinstance(v, (framework.Variable, typing.Sequence, type(None))):
+    if not isinstance(
+        v, (framework.Variable, typing.Sequence, type(None), paddle.pir.Value)
+    ):
         raise TypeError(
             f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}."
         )
 
     if isinstance(v, typing.Sequence) and not all(
-        isinstance(e, framework.Variable) for e in v
+        isinstance(e, (framework.Variable, paddle.pir.Value)) for e in v
     ):
         raise TypeError("All elements of 'xs' should be Tensor.")
 
diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py
index 109cde97a75ca..a33e1f4dfb8de 100644
--- a/python/paddle/incubate/autograd/primapi.py
+++ b/python/paddle/incubate/autograd/primapi.py
@@ -72,13 +72,17 @@ def forward_grad(outputs, inputs, grad_inputs=None):
             'operators, use enable_prim to turn it on.'
         )
 
-    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
+    if not isinstance(
+        outputs, (framework.Variable, typing.Sequence, paddle.pir.Value)
+    ):
         raise TypeError(
             f'Expected outputs is Tensor|Sequence[Tensor], '
             f'but got {type(outputs)}.'
         )
 
-    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
+    if not isinstance(
+        inputs, (framework.Variable, typing.Sequence, paddle.pir.Value)
+    ):
         raise TypeError(
             f'Expected inputs is Tensor|Sequence[Tensor], '
             f'but got {type(inputs)}.'
@@ -101,7 +105,11 @@ def forward_grad(outputs, inputs, grad_inputs=None):
     ad = primx.Transform(ys[0].block)
     _, ys_dot = ad.linearize(xs, ys, xs_dot)
 
-    return ys_dot[0] if isinstance(outputs, framework.Variable) else ys_dot
+    return (
+        ys_dot[0]
+        if isinstance(outputs, (framework.Variable, paddle.pir.Value))
+        else ys_dot
+    )
 
 
 @framework.static_only
@@ -155,7 +163,7 @@ def grad(outputs, inputs, grad_outputs=None):
         # The follow code snippet fixes the problem by return the first element
         # of grad_inputs when the inputs is a single Tensor.
         if (
-            isinstance(inputs, framework.Variable)
+            isinstance(inputs, (framework.Variable, paddle.pir.Value))
             and isinstance(grad_inputs, typing.Sequence)
             and len(grad_inputs) > 0
         ):
@@ -163,13 +171,17 @@ def grad(outputs, inputs, grad_outputs=None):
         else:
             return grad_inputs
 
-    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
+    if not isinstance(
+        outputs, (framework.Variable, typing.Sequence, paddle.pir.Value)
+    ):
         raise TypeError(
             f'Expected outputs is Tensor|Sequence[Tensor], '
             f'but got {type(outputs)}.'
         )
 
-    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
+    if not isinstance(
+        inputs, (framework.Variable, typing.Sequence, paddle.pir.Value)
+    ):
         raise TypeError(
             f'Expected inputs is Tensor|Sequence[Tensor], '
             f'but got {type(inputs)}.'
@@ -213,7 +225,11 @@ def grad(outputs, inputs, grad_outputs=None):
     ad.erase_ops(sorted(op_indexes))
     ad.erase_dots(xs_dot)
 
-    return xs_bar[0] if isinstance(inputs, framework.Variable) else xs_bar
+    return (
+        xs_bar[0]
+        if isinstance(inputs, (framework.Variable, paddle.pir.Value))
+        else xs_bar
+    )
 
 
 @framework.static_only
diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py
index 901e23a649974..ba2f0c2e615a1 100644
--- a/python/paddle/incubate/autograd/primx.py
+++ b/python/paddle/incubate/autograd/primx.py
@@ -137,8 +137,12 @@ def add(self, key_var, value_var):
     def add_rec(self, key_vars, value_vars):
         if value_vars is None:
             return
-        if isinstance(key_vars, paddle.base.framework.Variable):
-            if not isinstance(value_vars, paddle.base.framework.Variable):
+        if isinstance(
+            key_vars, (paddle.base.framework.Variable, paddle.pir.Value)
+        ):
+            if not isinstance(
+                value_vars, (paddle.base.framework.Variable, paddle.pir.Value)
+            ):
                 raise TypeError(
                     f'value_vars must be Variable, but got {type(value_vars)}'
                 )
@@ -208,7 +212,9 @@ def add_vars(self, new_vars):
     def add_vars_rec(self, new_vars):
         if new_vars is None:
             return
-        if isinstance(new_vars, paddle.base.framework.Variable):
+        if isinstance(
+            new_vars, (paddle.base.framework.Variable, paddle.pir.Value)
+        ):
             self.vars.update({id(new_vars): new_vars})
             return
         if not isinstance(new_vars, list):
@@ -242,7 +248,7 @@ def erase_dots(self, vars_to_erase):
 
     def var2dot_rec(self, vars):
         """Lookup var2dot recursively."""
-        if isinstance(vars, paddle.base.framework.Variable):
+        if isinstance(vars, (paddle.base.framework.Variable, paddle.pir.Value)):
             dot = self.var2dot.lookup(vars)
             return dot
 
@@ -250,7 +256,7 @@ def var2dot_rec(self, vars):
         return dots
 
     def dot2bar_rec(self, dots):
-        if isinstance(dots, paddle.base.framework.Variable):
+        if isinstance(dots, (paddle.base.framework.Variable, paddle.pir.Value)):
             bar = self.dot2bar.lookup(dots)
             assert bar is not None, 'bar must be not None'
             return bar
@@ -385,7 +391,9 @@ def bind(args, to_bind, value_table):
         for i in range(len(args)):
             if isinstance(args[i], list):
                 bind(args[i], to_bind, value_table)
-            if not isinstance(args[i], paddle.base.framework.Variable):
+            if not isinstance(
+                args[i], (paddle.base.framework.Variable, paddle.pir.Value)
+            ):
                 continue
             elif args[i] is not None and args[i].name in to_bind:
                 args[i] = value_table[to_bind[args[i].name]]
diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py
index b5bc0c6238ea7..0518071a90040 100644
--- a/python/paddle/incubate/autograd/utils.py
+++ b/python/paddle/incubate/autograd/utils.py
@@ -309,7 +309,9 @@ def map_output_for_composite(op):
 
 
 def flatten(inp):
-    if inp is None or isinstance(inp, paddle.base.framework.Variable):
+    if inp is None or isinstance(
+        inp, (paddle.base.framework.Variable, paddle.pir.Value)
+    ):
         return [inp]
     flattened = []
     for part in inp:
@@ -323,7 +325,7 @@ def flatten_and_remove_none(inp):
 
 
 def as_tensors(xs):
-    if isinstance(xs, framework.Variable):
+    if isinstance(xs, (framework.Variable, paddle.pir.Value)):
         return (xs,)
     elif isinstance(xs, typing.Sequence):
         return tuple(xs)
diff --git a/python/paddle/incubate/nn/functional/block_multihead_attention.py b/python/paddle/incubate/nn/functional/block_multihead_attention.py
index a55f61de2c678..596b9581570ad 100644
--- a/python/paddle/incubate/nn/functional/block_multihead_attention.py
+++ b/python/paddle/incubate/nn/functional/block_multihead_attention.py
@@ -389,3 +389,156 @@ def block_multihead_attention(
         },
     )
     return out, qkv, key_cache, value_cache
+
+
+def block_multihead_attention_xpu(
+    qkv,
+    key_cache,
+    value_cache,
+    seq_lens_encoder,
+    seq_lens_decoder,
+    seq_lens_this_time,
+    padding_offsets,
+    cum_offsets,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    block_tables,
+    cache_k_per_batch_maxs,
+    cache_v_per_batch_maxs,
+    pre_key_cache=None,
+    pre_value_cache=None,
+    cache_k_quant_scales=None,
+    cache_v_quant_scales=None,
+    cache_k_dequant_scales=None,
+    cache_v_dequant_scales=None,
+    qkv_out_scale=None,
+    qkv_bias=None,
+    out_shift=None,
+    out_smooth=None,
+    max_enc_len_this_time=None,
+    max_dec_len_this_time=None,
+    rope_emb=None,
+    mask=None,
+    tgt_mask=None,
+    max_seq_len=-1,
+    block_size=64,
+    use_neox_style=False,
+    use_dynamic_cachekv_quant=False,
+    quant_round_type=1,
+    quant_max_bound=127.0,
+    quant_min_bound=-127.0,
+    out_scale=-1,
+    compute_dtype="default",
+):
+    if in_dynamic_mode():
+        return _C_ops.block_multihead_attention_xpu(
+            qkv,
+            key_cache,
+            value_cache,
+            seq_lens_encoder,
+            seq_lens_decoder,
+            seq_lens_this_time,
+            padding_offsets,
+            cum_offsets,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            block_tables,
+            cache_k_per_batch_maxs,
+            cache_v_per_batch_maxs,
+            pre_key_cache,
+            pre_value_cache,
+            rope_emb,
+            mask,
+            tgt_mask,
+            cache_k_quant_scales,
+            cache_v_quant_scales,
+            cache_k_dequant_scales,
+            cache_v_dequant_scales,
+            qkv_out_scale,
+            qkv_bias,
+            out_shift,
+            out_smooth,
+            max_enc_len_this_time,
+            max_dec_len_this_time,
+            max_seq_len,
+            block_size,
+            use_neox_style,
+            use_dynamic_cachekv_quant,
+            quant_round_type,
+            quant_max_bound,
+            quant_min_bound,
+            out_scale,
+            compute_dtype,
+        )
+    # Not in dynamic mode: append the op through LayerHelper instead.
+    helper = LayerHelper('block_multihead_attention_xpu', **locals())
+    out = helper.create_variable_for_type_inference(dtype=qkv.dtype)
+    # These inputs are always passed; the optional ones below only if set.
+    inputs = {}
+    inputs['qkv'] = qkv
+    inputs['key_cache'] = key_cache
+    inputs['value_cache'] = value_cache
+    inputs['seq_lens_encoder'] = seq_lens_encoder
+    inputs['seq_lens_decoder'] = seq_lens_decoder
+    inputs['seq_lens_this_time'] = seq_lens_this_time
+    inputs['padding_offsets'] = padding_offsets
+    inputs['cum_offsets'] = cum_offsets
+    inputs['cu_seqlens_q'] = cu_seqlens_q
+    inputs['cu_seqlens_k'] = cu_seqlens_k
+    inputs['block_tables'] = block_tables
+    inputs['cache_k_per_batch_maxs'] = cache_k_per_batch_maxs
+    inputs['cache_v_per_batch_maxs'] = cache_v_per_batch_maxs
+    if pre_key_cache is not None:
+        inputs['pre_key_cache'] = pre_key_cache
+    if pre_value_cache is not None:
+        inputs['pre_value_cache'] = pre_value_cache
+    if rope_emb is not None:
+        inputs['rope_emb'] = rope_emb
+    if mask is not None:
+        inputs['mask'] = mask
+    if tgt_mask is not None:
+        inputs['tgt_mask'] = tgt_mask
+    if cache_k_quant_scales is not None:
+        inputs["cache_k_quant_scales"] = cache_k_quant_scales
+    if cache_v_quant_scales is not None:
+        inputs["cache_v_quant_scales"] = cache_v_quant_scales
+    if cache_k_dequant_scales is not None:
+        inputs["cache_k_dequant_scales"] = cache_k_dequant_scales
+    if cache_v_dequant_scales is not None:
+        inputs["cache_v_dequant_scales"] = cache_v_dequant_scales
+    if qkv_out_scale is not None:
+        inputs["qkv_out_scale"] = qkv_out_scale
+    if qkv_bias is not None:
+        inputs["qkv_bias"] = qkv_bias
+    if out_shift is not None:
+        inputs["out_shift"] = out_shift
+    if out_smooth is not None:
+        inputs["out_smooth"] = out_smooth
+    if max_enc_len_this_time is not None:
+        inputs["max_enc_len_this_time"] = max_enc_len_this_time
+    if max_dec_len_this_time is not None:
+        inputs["max_dec_len_this_time"] = max_dec_len_this_time
+    # qkv, key_cache and value_cache are reused as output variables.
+    outputs = {
+        'fmha_out': out,
+        'qkv_out': qkv,
+        'key_cache_out': key_cache,
+        'value_cache_out': value_cache,
+    }
+    helper.append_op(
+        type='block_multihead_attention_xpu',
+        inputs=inputs,
+        outputs=outputs,
+        attrs={
+            'max_seq_len': max_seq_len,
+            'block_size': block_size,
+            'use_neox_style': use_neox_style,
+            'dynamic_cachekv_quant': use_dynamic_cachekv_quant,
+            'quant_round_type': quant_round_type,
+            'quant_max_bound': quant_max_bound,
+            'quant_min_bound': quant_min_bound,
+            'out_scale': out_scale,
+            'compute_dtype': compute_dtype,
+        },
+    )
+    return out, qkv, key_cache, value_cache
diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py
index 56a0d8a613be6..d80737b0646e3 100644
--- a/python/paddle/jit/api.py
+++ b/python/paddle/jit/api.py
@@ -23,10 +23,21 @@
 import types
 import warnings
 from collections import OrderedDict
+from collections.abc import Callable, Sequence
 from contextlib import contextmanager
-from typing import Any
+from types import ModuleType
+from typing import (
+    Any,
+    Protocol,
+    TypedDict,
+    TypeVar,
+    overload,
+)
+
+from typing_extensions import Literal, NotRequired, ParamSpec, TypeAlias, Unpack
 
 import paddle
+from paddle._typing import NestedSequence
 from paddle.base import core, dygraph
 from paddle.base.compiler import (
     BuildStrategy,
@@ -45,6 +56,7 @@
 from paddle.base.wrapped_decorator import wrap_decorator
 from paddle.framework import use_pir_api
 from paddle.nn import Layer
+from paddle.static import InputSpec
 from paddle.static.io import save_inference_model
 from paddle.utils.environments import (
     BooleanEnvironmentVariable,
@@ -71,6 +83,11 @@
 
 ENV_ENABLE_SOT = BooleanEnvironmentVariable("ENABLE_FALL_BACK", True)
 
+_LayerT = TypeVar("_LayerT", bound=Layer)
+_RetT = TypeVar("_RetT")
+_InputT = ParamSpec("_InputT")
+Backends: TypeAlias = Literal["CINN"]
+
 
 @contextmanager
 def sot_mode_guard(value: bool):
@@ -98,13 +115,13 @@ def copy_decorator_attrs(original_func, decorated_obj):
     return decorated_obj
 
 
-def ignore_module(modules: list[Any]):
+def ignore_module(modules: list[ModuleType]) -> None:
     """
     Adds modules that ignore transcription.
     Builtin modules that have been ignored are collections, pdb, copy, inspect, re, numpy, logging, six
 
     Args:
-        modules (List[Any]): Ignored modules that you want to add
+        modules (list[ModuleType]): Ignored modules that you want to add
 
     Examples:
         .. code-block:: python
@@ -133,6 +150,67 @@ def _check_and_set_backend(backend, build_strategy):
         build_strategy.build_cinn_pass = True
 
 
+class ToStaticOptions(TypedDict):  # optional extras the to_static overloads accept through **kwargs
+    property: NotRequired[bool]
+    full_graph: NotRequired[bool]
+
+
+class ToStaticDecorator(Protocol):  # type of the callable to_static() returns when `function` is omitted
+    @overload
+    def __call__(self, function: _LayerT) -> _LayerT:  # a Layer subtype keeps its own type
+        ...
+
+    @overload
+    def __call__(  # any other callable is typed as a StaticFunction with the same signature
+        self, function: Callable[_InputT, _RetT]
+    ) -> StaticFunction[_InputT, _RetT]:
+        ...
+
+
+@overload  # Layer (or subclass) in -> the same Layer type out
+def to_static(
+    function: _LayerT,
+    input_spec: NestedSequence[InputSpec] | None = ...,
+    build_strategy: BuildStrategy | None = ...,
+    backend: Backends | None = ...,
+    **kwargs: Unpack[ToStaticOptions],
+) -> _LayerT:
+    ...
+
+
+@overload  # plain callable in -> StaticFunction carrying its parameter and return types
+def to_static(
+    function: Callable[_InputT, _RetT],
+    input_spec: NestedSequence[InputSpec] | None = ...,
+    build_strategy: BuildStrategy | None = ...,
+    backend: Backends | None = ...,
+    **kwargs: Unpack[ToStaticOptions],
+) -> StaticFunction[_InputT, _RetT]:
+    ...
+
+
+@overload  # untyped fallback for arguments the two overloads above do not match
+def to_static(
+    function: Any,
+    input_spec: NestedSequence[InputSpec] | None = ...,
+    build_strategy: BuildStrategy | None = ...,
+    backend: Backends | None = ...,
+    **kwargs: Unpack[ToStaticOptions],
+) -> Any:
+    ...
+
+
+@overload  # no function supplied -> a decorator (ToStaticDecorator) is returned
+def to_static(
+    function: None = ...,
+    input_spec: NestedSequence[InputSpec] | None = ...,
+    build_strategy: BuildStrategy | None = ...,
+    backend: Backends | None = ...,
+    **kwargs: Unpack[ToStaticOptions],
+) -> ToStaticDecorator:
+    ...
+
+
 def to_static(
     function=None,
     input_spec=None,
@@ -254,6 +332,28 @@ def decorated(python_func):
     return decorated
 
 
+class NotToStaticDecorator(Protocol):  # type of the callable not_to_static() returns when `func` is omitted
+    @overload
+    def __call__(  # a callable is typed as coming back with its signature unchanged
+        self, func: Callable[_InputT, _RetT]
+    ) -> Callable[_InputT, _RetT]:
+        ...
+
+    @overload
+    def __call__(self, func: None = ...) -> NotToStaticDecorator:  # called without a function -> another decorator
+        ...
+
+
+@overload  # direct use on a callable: the signature is preserved in the return type
+def not_to_static(func: Callable[_InputT, _RetT]) -> Callable[_InputT, _RetT]:
+    ...
+
+
+@overload  # called with no function: a decorator (NotToStaticDecorator) is returned
+def not_to_static(func: None = ...) -> NotToStaticDecorator:
+    ...
+
+
 def not_to_static(func=None):
     """
     A Decorator to suppresses the convention of a function.
@@ -337,14 +437,12 @@ def output_spec(self, spec):
             return
         if not isinstance(spec, list):
             raise TypeError(
-                "The config `output_spec` should be 'list', but received input type is %s."
-                % type(input)
+                f"The config `output_spec` should be 'list', but received input type is {type(spec)}."  # report the rejected `spec` argument, not the builtin `input`
             )
             for var in spec:
                 if not isinstance(var, core.eager.Tensor):
                     raise TypeError(
-                        "The element in config `output_spec` list should be 'Variable', but received element's type is %s."
-                        % type(var)
+                        f"The element in config `output_spec` list should be 'Variable', but received element's type is {type(var)}."
                     )
         self._output_spec = spec
 
@@ -358,8 +456,7 @@ def model_filename(self, filename):
             return
         if not isinstance(filename, str):
             raise TypeError(
-                "The config `model_filename` should be str, but received input's type is %s."
-                % type(filename)
+                f"The config `model_filename` should be str, but received input's type is {type(filename)}."
             )
         if len(filename) == 0:
             raise ValueError("The config `model_filename` is empty string.")
@@ -375,8 +472,7 @@ def params_filename(self, filename):
             return
         if not isinstance(filename, str):
             raise TypeError(
-                "The config `params_filename` should be str, but received input's type is %s."
-                % type(filename)
+                f"The config `params_filename` should be str, but received input's type is {type(filename)}."
             )
         if len(filename) == 0:
             raise ValueError("The config `params_filename` is empty string.")
@@ -392,13 +488,22 @@ def keep_name_table(self, value):
             return
         if not isinstance(value, bool):
             raise TypeError(
-                "The config `keep_name_table` should be bool value, but received input's type is %s."
-                % type(value)
+                f"The config `keep_name_table` should be bool value, but received input's type is {type(value)}."
             )
         self._keep_name_table = value
 
 
-def _parse_save_configs(configs):
+class _SaveLoadOptions(TypedDict):  # typed shape of the **configs keyword arguments on save() and load()
+    output_spec: NotRequired[Sequence[InputSpec]]
+    with_hook: NotRequired[bool]
+    combine_params: NotRequired[bool]
+    clip_extra: NotRequired[bool]
+    skip_forward: NotRequired[bool]
+    input_names_after_prune: NotRequired[list[str]]
+    skip_prune_program: NotRequired[bool]
+
+
+def _parse_save_configs(configs: _SaveLoadOptions):
     supported_configs = [
         "output_spec",
         "with_hook",
@@ -413,8 +518,7 @@ def _parse_save_configs(configs):
     for key in configs:
         if key not in supported_configs:
             raise ValueError(
-                "The additional config (%s) of `paddle.jit.save` is not supported."
-                % (key)
+                f"The additional config ({key}) of `paddle.jit.save` is not supported."
             )
 
     # construct inner config
@@ -439,8 +543,7 @@ def _parse_load_config(configs):
     for key in configs:
         if key not in supported_configs:
             raise ValueError(
-                "The additional config (%s) of `paddle.jit.load` is not supported."
-                % (key)
+                f"The additional config ({key}) of `paddle.jit.load` is not supported."
             )
 
     # construct inner config
@@ -554,7 +657,7 @@ def _get_output_vars(outputs, output_spec, with_hook=False):
             output_size = len(result_list)
             if len(output_spec) == output_size:
                 for var in output_spec:
-                    if not isinstance(var, paddle.pir.Value, int):
+                    if not isinstance(var, (paddle.pir.Value, int)):
                         warnings.warn(output_spec_is_not_value_error % var.name)
                     else:
                         if var not in ValueSet(result_list):
@@ -636,9 +739,9 @@ def _build_load_path_and_config(path, config):
         )
     elif not prefix_format_exist and not directory_format_exist:
         raise ValueError(
-            "The ``path`` (%s) to load model not exists. "
+            f"The ``path`` ({path}) to load model not exists. "
             "Please make sure that *.pdmodel exists or "
-            "don't using ``skip_forward=True`` to jit.save." % path
+            "don't using ``skip_forward=True`` to jit.save."
         )
     else:
         if prefix_format_exist:
@@ -802,7 +905,12 @@ def set_property(meta, key, val):
 
 @_run_save_pre_hooks
 @switch_to_static_graph
-def save(layer, path, input_spec=None, **configs):
+def save(
+    layer: Callable[_InputT, _RetT],
+    path: str,
+    input_spec: Sequence[InputSpec] | None = None,  # the body raises TypeError unless this is a list or tuple
+    **configs: Unpack[_SaveLoadOptions],
+) -> None:
     """
     Saves input Layer or function as ``paddle.jit.TranslatedLayer``
     format model, which can be used for inference or fine-tuning after loading.
@@ -954,8 +1062,7 @@ def save(layer, path, input_spec=None, **configs):
         isinstance(layer, (Layer, StaticFunction)) or inspect.isfunction(layer)
     ):
         raise TypeError(
-            "The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is %s."
-            % type(layer)
+            f"The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is {type(layer)}."
         )
     elif inspect.isfunction(layer) or isinstance(layer, StaticFunction):
         warnings.warn(
@@ -996,14 +1103,12 @@ def save(layer, path, input_spec=None, **configs):
                     and 'forward' != attr_func
                 ):
                     raise ValueError(
-                        "If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is %s."
-                        % type(input_spec)
+                        f"If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is {type(input_spec)}."
                     )
 
         if not isinstance(input_spec, (list, tuple)):
             raise TypeError(
-                "The input input_spec should be 'list', but received input_spec's type is %s."
-                % type(input_spec)
+                f"The input input_spec should be 'list', but received input_spec's type is {type(input_spec)}."
             )
         inner_input_spec = []
         for var in paddle.utils.flatten(input_spec):
@@ -1372,7 +1477,9 @@ def save(layer, path, input_spec=None, **configs):
 
 
 @dygraph_only
-def load(path, **configs):
+def load(
+    path: str, **configs: Unpack[_SaveLoadOptions]
+) -> TranslatedLayer | PirTranslatedLayer:
     """
     :api_attr: imperative
 
diff --git a/python/paddle/jit/dy2static/ast_utils.py b/python/paddle/jit/dy2static/ast_utils.py
index fc703dd6f6e49..7c4c90ec44d0e 100644
--- a/python/paddle/jit/dy2static/ast_utils.py
+++ b/python/paddle/jit/dy2static/ast_utils.py
@@ -27,8 +27,7 @@ def ast_to_source_code(ast_node):
     """
     if not isinstance(ast_node, (gast.AST, ast.AST)):
         raise TypeError(
-            "Type of ast_root should be gast.AST or ast.AST, but received %s."
-            % type(ast_node)
+            f"Type of ast_root should be gast.AST or ast.AST, but received {type(ast_node)}."
         )
     if isinstance(ast_node, gast.AST):
         ast_node = gast.gast_to_ast(ast_node)
diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py
index 7ef8b4ce88736..10d2c9633ae80 100644
--- a/python/paddle/jit/dy2static/convert_operators.py
+++ b/python/paddle/jit/dy2static/convert_operators.py
@@ -615,8 +615,7 @@ def convert_len(var):
             return paddle.tensor.array_length(var)
         else:
             raise TypeError(
-                'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received %s.'
-                % type(var)
+                f'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received {type(var)}.'
             )
     elif isinstance(var, Value):
         if var.is_dense_tensor_type() or var.is_selected_row_type():
diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py
index 7d5605f547df8..ce0b8382e9d01 100644
--- a/python/paddle/jit/dy2static/function_spec.py
+++ b/python/paddle/jit/dy2static/function_spec.py
@@ -179,7 +179,7 @@ def pir_to_static_inputs_with_spec(self, input_with_spec, main_program):
                 if isinstance(var_spec, paddle.static.InputSpec):
                     stop_gradient = getattr(var_spec, 'stop_gradient', False)
                     feed_value = paddle.static.input.data(
-                        name=var_spec.name or "feed_%s" % i,
+                        name=var_spec.name or f"feed_{i}",
                         shape=var_spec.shape,
                         dtype=convert_dtype(var_spec.dtype),
                     )
@@ -232,7 +232,7 @@ def to_static_inputs_with_spec(self, input_with_spec, main_program):
                 stop_gradient = getattr(var_spec, 'stop_gradient', False)
                 feed_layer = block.create_var(
                     # TODO(Aurelius84): consider a more elegant way to name this
-                    name=var_spec.name or "feed_%s" % i,
+                    name=var_spec.name or f"feed_{i}",
                     shape=var_spec.shape,
                     dtype=var_spec.dtype,
                     is_data=True,
diff --git a/python/paddle/jit/dy2static/logging_utils.py b/python/paddle/jit/dy2static/logging_utils.py
index d9e20b2a81d5c..837c3efae442d 100644
--- a/python/paddle/jit/dy2static/logging_utils.py
+++ b/python/paddle/jit/dy2static/logging_utils.py
@@ -180,7 +180,7 @@ def _output_to_stdout(self, msg, *args):
 _TRANSLATOR_LOGGER = TranslatorLogger()
 
 
-def set_verbosity(level=0, also_to_stdout=False):
+def set_verbosity(level: int = 0, also_to_stdout: bool = False) -> None:
     """
     Sets the verbosity level of log for dygraph to static graph. Logs can be output to stdout by setting `also_to_stdout`.
 
@@ -215,11 +215,13 @@ def set_verbosity(level=0, also_to_stdout=False):
     _TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = also_to_stdout
 
 
-def get_verbosity():
+def get_verbosity() -> int:
     return _TRANSLATOR_LOGGER.verbosity_level
 
 
-def set_code_level(level=LOG_AllTransformer, also_to_stdout=False):
+def set_code_level(
+    level: int = LOG_AllTransformer, also_to_stdout: bool = False
+) -> None:
     """
     Sets the level to print code from specific level Ast Transformer. Code can be output to stdout by setting `also_to_stdout`.
 
diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index 8571740db2659..f4fc6ea387f97 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -1108,8 +1108,7 @@ def _check_params_all_inited(self, main_program):
         """
         if not isinstance(self._params, (list, tuple)):
             raise TypeError(
-                "Type of self._params in PartialProgramLayer should be list or tuple, but received %s."
-                % type(self._params)
+                f"Type of self._params in PartialProgramLayer should be list or tuple, but received {type(self._params)}."
             )
 
         param_and_buffer_names_set = set()
@@ -1127,12 +1126,11 @@ def _check_params_all_inited(self, main_program):
                     if name not in param_and_buffer_names_set:
                         raise ValueError(
                             "\n\tWe don't support to define layer with parameters in the function decorated by `@to_static`."
-                            "\n\tBut we found parameter(%s) was created in the decorated function."
+                            f"\n\tBut we found parameter({name}) was created in the decorated function."
                             "\n"
                             "\n\tRevise suggestion: "
                             "\n\t\t1. Please ensure all your sublayers are inherited from nn.Layer."
                             "\n\t\t2. Please use nn.ParameterList and nn.LayerList as container instead of using a native Python container such as List"
-                            % name
                         )
 
     def _valid_vars(self, vars):
diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py
index 55d8ab47e92a4..ff6ee46c8a1f9 100644
--- a/python/paddle/jit/dy2static/pir_partial_program.py
+++ b/python/paddle/jit/dy2static/pir_partial_program.py
@@ -1257,8 +1257,7 @@ def _check_params_all_inited(self, main_program):
         """
         if not isinstance(self._params, (list, tuple)):
             raise TypeError(
-                "Type of self._params in PartialProgramLayer should be list or tuple, but received %s."
-                % type(self._params)
+                f"Type of self._params in PartialProgramLayer should be list or tuple, but received {type(self._params)}."
             )
 
         param_and_buffer_names_set = set()
diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py
index ea4040485b64a..ac50ba8b5f50c 100644
--- a/python/paddle/jit/dy2static/program_translator.py
+++ b/python/paddle/jit/dy2static/program_translator.py
@@ -19,11 +19,14 @@
 import threading
 import warnings
 import weakref
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar
+
+from typing_extensions import ParamSpec, Self
 
 import paddle
 import paddle.pir.core as ir_static
 from paddle import decomposition, get_flags
+from paddle._typing import NestedSequence
 from paddle.base import core, framework
 from paddle.base.data_feeder import check_type
 from paddle.base.dygraph.base import (
@@ -35,6 +38,7 @@
 from paddle.nn.layer import layers
 from paddle.pir import Value
 from paddle.pir.core import _convert_into_value, static_op_arg_cast_guard
+from paddle.static import InputSpec, Program
 from paddle.utils import flatten, gast
 
 from . import error, logging_utils
@@ -49,8 +53,9 @@
     create_and_update_origin_info_map,
     update_op_callstack_with_origin_info,
 )
-from .partial_program import PartialProgramLayerHook
+from .partial_program import PartialProgramLayer, PartialProgramLayerHook
 from .pir_partial_program import (
+    PartialProgramLayer as PirPartialProgramLayer,
     PartialProgramLayerHook as PirPartialProgramLayerHook,
 )
 from .transformers import DygraphToStaticAst
@@ -72,6 +77,9 @@
 if TYPE_CHECKING:
     from paddle.static.amp.fp16_utils import AmpOptions
 
+_RetT = TypeVar("_RetT")
+_InputT = ParamSpec("_InputT")
+
 __all__ = []
 
 # For each traced function, we set `max_traced_program_count` = 10 to consider caching performance.
@@ -318,7 +326,7 @@ def unwrap_decorators(func):
     return decorators, cur
 
 
-class StaticFunction:
+class StaticFunction(Generic[_InputT, _RetT]):
     def __init__(self, function, input_spec=None, **kwargs):
         """
         Initializes a `StaticFunction`.
@@ -374,7 +382,7 @@ def __init__(self, function, input_spec=None, **kwargs):
         self._property = kwargs.get("property", False)
         self._get_debug_name()
 
-    def _get_debug_name(self):
+    def _get_debug_name(self) -> None:  # stores the name on self._debug_name; the caller ignores the return value
         try:
             if self._class_instance:
                 self._debug_name = self._class_instance.__class__.__name__
@@ -384,11 +392,11 @@ def _get_debug_name(self):
             self._debug_name = "static_function"
 
     @property
-    def is_property(self):
+    def is_property(self) -> bool:
         # whether is class proproty to be exported.
         return self._property
 
-    def train(self):
+    def train(self) -> None:
         if (
             isinstance(self._class_instance, layers.Layer)
             and self._class_instance.training is False
@@ -399,7 +407,7 @@ def train(self):
             )
         self._training = True
 
-    def eval(self):
+    def eval(self) -> None:
         if (
             isinstance(self._class_instance, layers.Layer)
             and self._class_instance.training is True
@@ -452,12 +460,12 @@ def forward(self, x, y):
 
         return self._descriptor_cache[instance]
 
-    def _clone(self):
+    def _clone(self) -> Self:
         return self.__class__(
             self.dygraph_function, self._input_spec, **self._kwargs
         )
 
-    def __call__(self, *args, **kwargs):
+    def __call__(self, *args: _InputT.args, **kwargs: _InputT.kwargs) -> _RetT:
         """
         Supports to call the returned instance with input `args` and `kwargs` directly.
 
@@ -493,7 +501,7 @@ def __call__(self, *args, **kwargs):
 
         return self._perform_call(*args, **kwargs)
 
-    def _is_train_mode(self):
+    def _is_train_mode(self) -> bool:
         if self._class_instance is not None:
             if not hasattr(self._class_instance, 'training'):
                 raise TypeError(
@@ -504,7 +512,9 @@ def _is_train_mode(self):
         else:
             return self._training
 
-    def _call_dygraph_function(self, *args, **kwargs):
+    def _call_dygraph_function(
+        self, *args: _InputT.args, **kwargs: _InputT.kwargs
+    ) -> _RetT:
         """
         Calls dygraph function directly and returns the outputs.
 
@@ -526,7 +536,9 @@ def _raise_when_property(self):
         if self.is_property:
             raise RuntimeError("Can not call the func when property=True.")
 
-    def get_concrete_program(self, *args, **kwargs):
+    def get_concrete_program(
+        self, *args: _InputT.args, **kwargs: _InputT.kwargs
+    ) -> tuple[ConcreteProgram, PartialProgramLayer | PirPartialProgramLayer]:
         raise NotImplementedError("Not implemented yet.")
 
     def get_concrete_program_with_cache_key(self, cached_key):
@@ -536,11 +548,11 @@ def get_traced_count(self):
         raise NotImplementedError("Not implemented yet.")
 
     @property
-    def code(self):
+    def code(self) -> str:
         raise NotImplementedError("Not implemented yet.")
 
     @property
-    def dygraph_function(self):
+    def dygraph_function(self) -> Callable[_InputT, _RetT]:
         """
         Returns the original decorated function.
         """
@@ -550,15 +562,18 @@ def dygraph_function(self):
             return self._dygraph_function
 
     @property
-    def concrete_program(self):
+    def concrete_program(self) -> ConcreteProgram:
         raise NotImplementedError("Not implemented yet.")
 
     def concrete_program_specify_input_spec(
-        self, input_spec=None, with_hook=False, is_prim_infer=False
+        self,
+        input_spec: NestedSequence[InputSpec] | None = None,
+        with_hook: bool = False,
+        is_prim_infer: bool = False,
     ):
         raise NotImplementedError("Not implemented yet.")
 
-    def rollback(self):
+    def rollback(self) -> Callable[_InputT, _RetT]:
         """
         Rollback into original dygraph functions for current class instance.
 
@@ -662,23 +677,23 @@ def __deepcopy__(self, memo):
             return self._dygraph_function
 
     @property
-    def inputs(self):
+    def inputs(self) -> list[Any]:
         raise NotImplementedError("Not implemented yet.")
 
     @property
-    def outputs(self):
+    def outputs(self) -> list[Any]:
         raise NotImplementedError("Not implemented yet.")
 
     @property
-    def main_program(self):
+    def main_program(self) -> Program:
         raise NotImplementedError("Not implemented yet.")
 
     @property
-    def program_cache(self):
+    def program_cache(self) -> ProgramCache:
         raise NotImplementedError("Not implemented yet.")
 
     @property
-    def function_spec(self):
+    def function_spec(self) -> FunctionSpec:
         raise NotImplementedError("Not implemented yet.")
 
 
@@ -762,10 +777,10 @@ def program_cache(self):
 
     @property
     def function_spec(self):
-        raise_error_template("function_spec ")()
+        raise_error_template("function_spec")()
 
 
-class ASTStaticFunction(StaticFunction):
+class ASTStaticFunction(StaticFunction[_InputT, _RetT]):
     """
     Wrapper class to Manage program conversion of decorated function.
 
@@ -812,7 +827,9 @@ def _perform_call(self, *args, **kwargs):
                 )
                 raise e
 
-    def get_concrete_program(self, *args, **kwargs):
+    def get_concrete_program(
+        self, *args: _InputT.args, **kwargs: _InputT.kwargs
+    ) -> tuple[ConcreteProgram, PartialProgramLayer | PirPartialProgramLayer]:
         """
         Returns traced concrete program and inner executable partial layer.
 
@@ -867,7 +884,9 @@ def get_concrete_program(self, *args, **kwargs):
         partial_program_layer._debug_name = self._debug_name
         return concrete_program, partial_program_layer
 
-    def get_concrete_program_with_cache_key(self, cached_key):
+    def get_concrete_program_with_cache_key(
+        self, cached_key: CacheKey
+    ) -> tuple[ConcreteProgram, PartialProgramLayer | PirPartialProgramLayer]:
         """
         Returns traced concrete program and inner executable partial layer by cached key.
 
@@ -884,14 +903,14 @@ def get_concrete_program_with_cache_key(self, cached_key):
         ) = self._program_cache.get_program_without_cache(cached_key)
         return concrete_program, partial_program_layer
 
-    def get_traced_count(self):
+    def get_traced_count(self) -> int:
         """
         Returns the number of traced programs for the decorated function.
         """
         return len(self._program_cache)
 
     @property
-    def code(self):
+    def code(self) -> str:
         """
         Returns the source code of transformed static function for debugging.
         """
@@ -900,7 +919,7 @@ def code(self):
         return source_code
 
     @property
-    def concrete_program(self):
+    def concrete_program(self) -> ConcreteProgram:
         """
         Returns recent ConcreteProgram instance of decorated function.
 
@@ -930,8 +949,11 @@ def concrete_program(self):
         return self.concrete_program_specify_input_spec(input_spec=None)
 
     def concrete_program_specify_input_spec(
-        self, input_spec=None, with_hook=False, is_prim_infer=False
-    ):
+        self,
+        input_spec: NestedSequence[InputSpec] | None = None,
+        with_hook: bool = False,
+        is_prim_infer: bool = False,
+    ) -> ConcreteProgram:
         """
         Returns recent ConcreteProgram instance of decorated function while
         specifying input_spec. If the self._function_spec already has
@@ -1006,7 +1028,7 @@ def concrete_program_specify_input_spec(
                 )
 
     @property
-    def inputs(self):
+    def inputs(self) -> list[Any]:
         """
         Returns input tensors of recent converted static program.
         """
@@ -1020,7 +1042,7 @@ def inputs(self):
         return inputs
 
     @property
-    def outputs(self):
+    def outputs(self) -> list[Any]:
         """
         Returns output tensors of recent converted static program.
         """
@@ -1035,7 +1057,7 @@ def outputs(self):
         return outputs
 
     @property
-    def main_program(self):
+    def main_program(self) -> Program:
         """
         Returns recent converted static main program.
         """
@@ -1045,11 +1067,11 @@ def main_program(self):
         return main_program
 
     @property
-    def program_cache(self):
+    def program_cache(self) -> ProgramCache:
         return self._program_cache
 
     @property
-    def function_spec(self):
+    def function_spec(self) -> FunctionSpec:
         return self._function_spec
 
 
@@ -1597,8 +1619,7 @@ def _build_once(self, cache_key):
     def __getitem__(self, item):
         if not isinstance(item, CacheKey):
             raise ValueError(
-                'type(item) should be CacheKey, but received %s'
-                % type_name(item)
+                f'type(item) should be CacheKey, but received {type_name(item)}'
             )
         item_id = hash(item)
         self._recent_cache_key = item
@@ -1621,8 +1642,7 @@ def get_program_without_cache(self, cache_key):
     def get_program(self, item):
         if not isinstance(item, CacheKey):
             raise ValueError(
-                "Input item's type should be FunctionSpec, but received %s"
-                % type_name(item)
+                f"Input item's type should be CacheKey, but received {type_name(item)}"  # name the type the isinstance check actually requires
             )
         item_id = hash(item)
         if item_id not in self._caches:
@@ -1757,7 +1777,7 @@ def enable(self, enable_to_static):
         self.enable_to_static = enable_to_static
 
 
-def enable_to_static(enable_to_static_bool):
+def enable_to_static(enable_to_static_bool: bool) -> None:
     """
     Enable or disable the converting from imperative to static graph by
     ProgramTranslator globally.
diff --git a/python/paddle/jit/dy2static/transformers/early_return_transformer.py b/python/paddle/jit/dy2static/transformers/early_return_transformer.py
index 4dab1e5ab1638..ce8cf9e606878 100644
--- a/python/paddle/jit/dy2static/transformers/early_return_transformer.py
+++ b/python/paddle/jit/dy2static/transformers/early_return_transformer.py
@@ -36,9 +36,7 @@ def transform(self):
     def is_define_return_in_if(self, node):
         assert isinstance(
             node, gast.If
-        ), "Type of input node should be gast.If, but received %s ." % type(
-            node
-        )
+        ), f"Type of input node should be gast.If, but received {type(node)}."
         for child in node.body:
             if isinstance(child, gast.Return):
                 return True
diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py
index ad195befba4b5..03a2cd06d3211 100644
--- a/python/paddle/jit/dy2static/utils.py
+++ b/python/paddle/jit/dy2static/utils.py
@@ -204,7 +204,7 @@ def make_hashable(x, error_msg=None):
             return tuple(map(make_hashable, x.values()))
 
         error_msg = error_msg or "Requires a hashable object."
-        raise ValueError(error_msg + " But received type: %s" % type_name(x))
+        raise ValueError(f"{error_msg} But received type: {type_name(x)}")
 
     return x
 
@@ -327,8 +327,7 @@ def func_prefix(func):
         callable_func = getattr(module, func_name)
     else:
         raise ValueError(
-            'Function: %s doesn\'t exist in the Module transformed from AST.'
-            % func_name
+            f'Function: {func_name} doesn\'t exist in the Module transformed from AST.'
         )
     # After transform dygraph function into callable_func saved in tmp file,
     # it lost the global variables from imported statements or defined in source file.
diff --git a/python/paddle/jit/pir_translated_layer.py b/python/paddle/jit/pir_translated_layer.py
index 8a6e3ede35e2a..df3217ceb07b3 100644
--- a/python/paddle/jit/pir_translated_layer.py
+++ b/python/paddle/jit/pir_translated_layer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import os
 
 import numpy as np
@@ -217,7 +219,6 @@ def _load_pir_parameter_vars(model_path, program_holder, params_filename):
     # load all vars
     assert params_filename is not None, "params_filename should not be None."
     var_file_path = os.path.join(model_path, params_filename)
-
     if os.path.exists(var_file_path):
         core.load_combine_func(
             var_file_path,
@@ -228,8 +229,7 @@ def _load_pir_parameter_vars(model_path, program_holder, params_filename):
         )
     else:
         raise ValueError(
-            "The file %s does not exist. Please check the model path."
-            % var_file_path
+            f"The file {var_file_path} does not exist. Please check the model path."
         )
 
     load_var_dict.update(other_var_dict)
@@ -328,8 +328,7 @@ def _run_dygraph(instance, input, program_holder):
     for i, value in enumerate(input):
         if not isinstance(value, (np.ndarray, core.eager.Tensor)):
             raise TypeError(
-                "The type of input in PirTranslatedLayer must be numpy array or Variable(Tensor), but received %s."
-                % type(value)
+                f"The type of input in PirTranslatedLayer must be numpy array or Variable(Tensor), but received {type(value)}."
             )
         # NOTE: In order to unify the API, firstly convert the input to Tensor
         if isinstance(value, np.ndarray):
@@ -361,8 +360,7 @@ def _run_dygraph(instance, input, program_holder):
             persistable_tensors.append(instance._buffers[dy_var_name])
         else:
             raise ValueError(
-                "The persistable variable %s does not exist in current PirTranslatedLayer."
-                % var_name
+                f"The persistable variable {var_name} does not exist in current PirTranslatedLayer."
             )
 
     from paddle.jit.dy2static.pir_partial_program import PartialProgramLayer
@@ -378,7 +376,6 @@ def _run_dygraph(instance, input, program_holder):
         parameters,
     )
     instance.layer = layer
-
     if instance._is_test:
         layer.training = False
     else:
@@ -392,9 +389,42 @@ def _run_dygraph(instance, input, program_holder):
     return instance.layer(input_tensors)
 
 
-def _run_static_graph(program_holder, trace_program):
-    paddle.base.framework.switch_main_program(trace_program)
-    return program_holder.output_vars
+def _run_static_graph(inputs, program_holder, src_program):
+    '''
+    This function is used when the PirTranslatedLayer is
+    applied for dy_to_static conversion.
+    '''
+    dst_program = paddle.static.default_main_program()
+    value_map = paddle.pir.IrMapping()
+    # Establish a mapping relationship between existing parameters
+    # and corresponding parameters in the program to be copied
+    len_dst_op = len(dst_program.global_block().ops)
+    for dst_op in dst_program.global_block().ops:
+        if dst_op.name() == "builtin.parameter":
+            for src_op in src_program.global_block().ops[:len_dst_op]:
+                if (
+                    src_op.name() == dst_op.name()
+                    and src_op.result(0).name == dst_op.result(0).name
+                ):
+                    for i in range(src_op.num_results()):
+                        value_map.add(src_op.result(i), dst_op.result(i))
+    # Establish a mapping relationship between the actual inputs
+    # and the corresponding inputs in the program to be copied
+    src_inputs = program_holder.input_vars
+    if len(src_inputs) != len(inputs):
+        raise ValueError(
+            f"The number of input is invalid, expected {len(src_inputs)}, but received {len(inputs)}."
+        )
+    for src_input, input_ in zip(src_inputs, inputs):
+        value_map.add(src_input, input_)
+
+    # Find the insertion point for the copy
+    current_insert_point = paddle.pir.get_current_insertion_point()
+    current_block = current_insert_point.block()
+    src_program.copy_to_block(value_map, current_block)
+
+    output = [value_map.look_up(v) for v in program_holder.output_vars]
+    return output[0] if len(output) == 1 else output
 
 
 def _collect_current_and_parent_var(program, block_idx):
@@ -514,7 +544,11 @@ class PirTranslatedLayer(layers.Layer):
 
     """
 
-    def __init__(self, programs, persistable_vars):
+    def __init__(
+        self,
+        programs: dict[str, paddle.static.Program],
+        persistable_vars: dict[str, paddle.Tensor],
+    ):
         super().__init__()
 
         if not isinstance(programs, dict):
@@ -561,7 +595,7 @@ def _construct(model_path, configs=None):
         # 0. dir and filename check
         model_path = os.path.normpath(model_path)
         if not os.path.isdir(model_path):
-            raise ValueError("There is no directory named '%s'" % model_path)
+            raise ValueError(f"There is no directory named '{model_path}'")
         model_filename = None
         params_filename = None
         if configs is not None:
@@ -608,7 +642,7 @@ def __i_m_p_l__(self, *input):
                 return _run_dygraph(self, input, program_holder)
             else:
                 return _run_static_graph(
-                    program_holder, program_holder.infer_program
+                    input, program_holder, program_holder.infer_program
                 )
 
         __i_m_p_l__.__name__ = method_name
@@ -719,8 +753,7 @@ def _get_program_holder(self, method_name='forward'):
         program_holder = self._program_holder_dict.get(method_name, None)
         if program_holder is None:
             raise ValueError(
-                "The method `%s` does not exist in loaded PirTranslatedLayer."
-                % method_name
+                f"The method `{method_name}` does not exist in loaded PirTranslatedLayer."
             )
         return program_holder
 
diff --git a/python/paddle/jit/sot/infer_meta.py b/python/paddle/jit/sot/infer_meta.py
index 3ec9f0d891c9e..a67b10c27105f 100644
--- a/python/paddle/jit/sot/infer_meta.py
+++ b/python/paddle/jit/sot/infer_meta.py
@@ -11,8 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 from functools import cached_property
+from typing import TypeVar
 
 import paddle
 from paddle.amp.auto_cast import amp_state
@@ -26,10 +28,32 @@
 
 from .utils import Cache, Singleton, map_if_extend, meta_str
 
+DynamicSymbolT = TypeVar("DynamicSymbolT")
+
+
+class SymbolicInt(metaclass=Singleton):
+    def __eq__(self, other) -> bool:
+        return isinstance(other, (int, SymbolicInt))
+
+    def __repr__(self) -> str:
+        return "SymbolicInt()"
+
+    def __str__(self) -> str:
+        return "SymbolicInt()"
+
 
 class MetaInfo:
     def __init__(
-        self, shape, dtype, stop_gradient, name, persistable, type, place
+        self,
+        shape,
+        dtype,
+        stop_gradient,
+        name,
+        persistable,
+        type,
+        place,
+        *,
+        dynamic_axes: list[int] | None = None,
     ):
         self.name = name
         self.persistable = persistable
@@ -38,9 +62,18 @@ def __init__(
         self.shape = shape
         self.dtype = dtype
         self.stop_gradient = stop_gradient
+        self.dynamic_axes = dynamic_axes or []
+
+    def get_dynamic_shape(
+        self, dynamic_symbol: DynamicSymbolT = -1
+    ) -> list[int | DynamicSymbolT]:
+        return [
+            dim if i not in self.dynamic_axes else dynamic_symbol
+            for i, dim in enumerate(self.shape)
+        ]
 
     @staticmethod
-    def from_tensor(tensor):
+    def from_tensor(tensor, *, dynamic_axes: list[int] | None = None):
         if isinstance(tensor, paddle.pir.Value):
             name = "Value@NoName"
         else:  # For Tensor or Variable
@@ -54,6 +87,7 @@ def from_tensor(tensor):
         )
         assert isinstance(dtype, expected_dtype_class)
 
+        # TODO(@xiongkun) remove after PIR becomes the default state.
         # We always use float32 in simulation if AMP is enabled.
         current_amp_state = amp_state()
         if (
@@ -63,7 +97,12 @@ def from_tensor(tensor):
             and current_amp_state["dtype"] == "float16"
         ):
             dtype = paddle.float32
-        # TODO(@xiongkun) remove after pir become default state.
+        dynamic_axes = dynamic_axes or []
+        dynamic_axes = [
+            i
+            for i, dim in enumerate(tensor.shape)
+            if dim == -1 or i in dynamic_axes
+        ]
         return MetaInfo(
             list(tensor.shape),
             dtype,
@@ -72,6 +111,7 @@ def from_tensor(tensor):
             persistable,
             tensor.type,
             tensor.place,
+            dynamic_axes=dynamic_axes,
         )
 
     def is_dynamic_shape(self):
@@ -82,12 +122,14 @@ def is_dynamic_shape(self):
         return -1 in self.shape
 
     def to_input_spec(self):
+        shape = self.get_dynamic_shape(None)
         return paddle.static.InputSpec(
-            self.shape, dtype=self.dtype, stop_gradient=self.stop_gradient
+            shape, dtype=self.dtype, stop_gradient=self.stop_gradient
         )
 
     def guard_str(self):
-        return f"({self.shape}, {self.dtype}, {self.stop_gradient})"
+        shape = self.get_dynamic_shape(SymbolicInt())
+        return f"({shape}, {self.dtype}, {self.stop_gradient})"
 
     def __repr__(self):
         return meta_str(self.shape, self.dtype, self.stop_gradient)
@@ -161,20 +203,22 @@ def startup_program(self):
         else:
             return self.legacy_programs[1]
 
-    def create_var(self, meta):
+    def create_var(self, meta: MetaInfo):
+        shape = meta.get_dynamic_shape()
+
         if paddle.framework.use_pir_api():
             with paddle.static.program_guard(
                 self.main_program, self.startup_program
             ):
                 var = paddle.static.input.data(
                     name=self.gen_name(meta),
-                    shape=meta.shape,
+                    shape=shape,
                     dtype=convert_dtype(meta.dtype),
                 )
                 var.stop_gradient = meta.stop_gradient
         else:
             var = self.main_program.global_block().create_var(
-                shape=meta.shape,
+                shape=shape,
                 dtype=meta.dtype,
                 stop_gradient=meta.stop_gradient,
             )
@@ -193,9 +237,10 @@ def infer_meta(self, func, *args, **kwargs):
         with paddle.base.framework._dygraph_guard(None), UniqueNameGuard(
             self.var_name_generator
         ):
-            args, kwargs = convert_meta_to_variable(
-                args
-            ), convert_meta_to_variable(kwargs)
+            args, kwargs = (
+                convert_meta_to_variable(args),
+                convert_meta_to_variable(kwargs),
+            )
 
             with paddle.static.program_guard(
                 self.main_program, self.startup_program
@@ -225,9 +270,11 @@ def convert_meta_to_input_spec(args):
         pred=lambda x: isinstance(x, MetaInfo),
         true_fn=lambda x: x.to_input_spec(),
         # TODO(xiongkun): can x be tensor ?
-        false_fn=lambda x: paddle.static.InputSpec.from_tensor(x)
-        if isinstance(x, paddle.Tensor)
-        else x,
+        false_fn=lambda x: (
+            paddle.static.InputSpec.from_tensor(x)
+            if isinstance(x, paddle.Tensor)
+            else x
+        ),
     )
 
 
diff --git a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py
index f94884d0c118b..bbefddda639ad 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py
@@ -56,12 +56,16 @@ class OpcodeExecutorCache(metaclass=Singleton):
     MAX_CACHE_SIZE = 20
     cache: dict[types.CodeType, GuardedFunctions]
     translate_count: int
-    symbolic_inputs: dict[str, dict[int, int]]
+    code_symbolic_inputs: dict[types.CodeType, dict[str, dict[int, int]]]
 
     def __init__(self):
         self.cache = {}
         self.translate_count = 0
-        self.symbolic_inputs = {}
+        self.code_symbolic_inputs = {}
+
+    def get_symbolic_inputs(self, code: types.CodeType):
+        self.code_symbolic_inputs.setdefault(code, {})
+        return self.code_symbolic_inputs[code]
 
     def clear(self):
         """
@@ -69,6 +73,7 @@ def clear(self):
         """
         self.cache.clear()
         self.translate_count = 0
+        self.code_symbolic_inputs.clear()
 
     def __call__(self, frame: types.FrameType, **kwargs) -> CustomCode:
         code: types.CodeType = frame.f_code
diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py
index 99ea75ebbcd48..93de3c8dfe815 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py
@@ -22,9 +22,9 @@
 from collections import namedtuple
 from copy import deepcopy
 from functools import cached_property
-from typing import Any, Callable
+from typing import Any, Callable, Tuple, Union
 
-from typing_extensions import TypeGuard
+from typing_extensions import TypeAlias, TypeGuard
 
 import paddle
 from paddle.jit.utils import OrderedSet
@@ -37,7 +37,7 @@
     ast_infer_meta,
 )
 from ...profiler import EventGuard, event_register
-from ...symbolic.statement_ir import Reference, Symbol
+from ...symbolic.statement_ir import Reference, StatementIR, Symbol
 from ...symbolic.symbolic_context import SymbolicTraceContext
 from ...utils import (
     NameGenerator,
@@ -81,6 +81,15 @@
     map_variables,
 )
 
+CompileGraphResult: TypeAlias = Tuple[
+    Callable[..., Any],
+    Tuple[
+        StatementIR,
+        OrderedSet[Union[TensorVariable, SymbolicVariable]],
+        OrderedSet[Union[TensorVariable, SymbolicVariable]],
+    ],
+]
+
 
 def convert_to_meta(inputs: Any):
     """
@@ -329,7 +338,7 @@ def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx):
 
         self.pycode_gen.gen_enable_eval_frame()
 
-        name_gen = NameGenerator("__start_compile_saved_orig_")
+        name_gen = NameGenerator("___compile_fn_saved_orig_")
 
         # here is not update changed values, it just give names to stack vars
         # and want keep same interface as _build_compile_fn_with_name_store
@@ -344,13 +353,18 @@ def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx):
 
         return VariableLoader(store_var_info, self.pycode_gen)
 
-    def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info):
+    def _build_compile_fn_with_name_store(
+        self,
+        compile_graph_result: CompileGraphResult,
+        to_store_vars,
+        store_var_info,
+    ):
         # var_id -> local_name mapping
         to_store_vars = list(
             filter(lambda x: not isinstance(x, NullVariable), to_store_vars)
         )
-        self.start_compile(*to_store_vars)
-        name_gen = NameGenerator("__start_compile_saved_")
+        self.compile_function(compile_graph_result, to_store_vars)
+        name_gen = NameGenerator("___compile_fn_saved_")
 
         for var in to_store_vars[::-1]:
             if store_var_info[var.id] is None:
@@ -363,23 +377,38 @@ def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info):
 
         return VariableLoader(store_var_info, self.pycode_gen)
 
-    def get_compiled_fn(self, *ret_vars):
+    def compile_graph(self, *ret_vars: VariableBase) -> CompileGraphResult:
         ret_items = [
             ret_item
             for ret_var in ret_vars
             for ret_item in ret_var.flatten_items()
         ]
 
-        tensor_items = self._find_tensor_outputs(ret_items)
-        compiled_fn, _ = self.sir_ctx.compile_fn(
-            [Symbol(tensor_var.var_name) for tensor_var in tensor_items],
+        symbolic_outputs = self._find_tensor_outputs(ret_items)
+        statement_ir = self.sir_ctx.return_TOS(
+            [Symbol(tensor_var.var_name) for tensor_var in symbolic_outputs]
+        )
+        if not statement_ir.statements:
+            return self.sir_ctx.compile_do_nothing(), (
+                statement_ir,
+                OrderedSet(),
+                OrderedSet(),
+            )
+        input_names = statement_ir.inputs
+        symbolic_inputs = self._find_tensor_inputs(input_names)
+        compiled_fn = self.sir_ctx.compile_fn(
+            statement_ir.name,
+            [var.meta.to_input_spec() for var in symbolic_inputs],
             **self._kwargs,
         )
+        return compiled_fn, (statement_ir, symbolic_inputs, symbolic_outputs)
 
-        return compiled_fn
-
-    @event_register("start_compile", event_level=2)
-    def start_compile(self, *ret_vars: VariableBase):
+    @event_register("compile_function", event_level=2)
+    def compile_function(
+        self,
+        compile_graph_result: CompileGraphResult,
+        ret_vars: list[VariableBase],
+    ):
         """
         Generate bytecode based on the information collected by the simulation execution.
 
@@ -393,48 +422,24 @@ def start_compile(self, *ret_vars: VariableBase):
         """
         from ..breakpoint import BreakpointManager
 
-        BreakpointManager().on_event("start_compile")
-
-        ret_items = [
-            ret_item
-            for ret_var in ret_vars
-            for ret_item in ret_var.flatten_items()
-        ]
-
-        tensor_items = self._find_tensor_outputs(ret_items)
-        compiled_fn, statement_ir = self.sir_ctx.compile_fn(
-            [Symbol(tensor_var.var_name) for tensor_var in tensor_items],
-            **self._kwargs,
-        )
-        input_names = statement_ir.inputs
-        compiled_fn_name = f"__compiled_fn_{statement_ir.name}"
+        BreakpointManager().on_event("compile_function")
+        graph_fn, (
+            statement_ir,
+            symbolic_inputs,
+            symbolic_outputs,
+        ) = compile_graph_result
+        compiled_fn_name = f"___graph_fn_{statement_ir.name}"
         # prepare function and inputs
-        self.pycode_gen.gen_load_object(compiled_fn, compiled_fn_name)
-        for name in input_names:
-            found = False
-            for variable in self.input_variables:
-                if (
-                    isinstance(variable, (TensorVariable, SymbolicVariable))
-                    and variable.get_symbol().name == name
-                ):
-                    if isinstance(variable, SymbolicVariable):
-                        self.pycode_gen.gen_load_object(
-                            paddle.to_tensor, "___paddle_to_tensor"
-                        )
-                    variable.tracker.gen_instructions(self.pycode_gen)
-                    found = True
-                    if isinstance(variable, SymbolicVariable):
-                        self.pycode_gen.gen_call_function(1)
-                    break
-            assert found, f"can't find input {name} in SIR."
+        self.pycode_gen.gen_load_object(graph_fn, compiled_fn_name)
+        self.gen_load_inputs(symbolic_inputs)
         # Pack all args into a tuple, because we don't support *args now.
-        self.pycode_gen.gen_build_tuple(count=len(input_names))
-        # call the compiled_fn
+        self.pycode_gen.gen_build_tuple(count=len(symbolic_inputs))
+        # call the graph_fn
         self.pycode_gen.gen_call_function(argc=1)
 
         # Store outputs to f_locals
-        self.pycode_gen.gen_unpack_sequence(count=len(tensor_items))
-        for tensor_var in tensor_items:
+        self.pycode_gen.gen_unpack_sequence(count=len(symbolic_outputs))
+        for tensor_var in symbolic_outputs:
             self.pycode_gen.gen_store_fast(tensor_var.out_var_name)
         # restore the outputs.
         for ret_var in ret_vars:
@@ -725,6 +730,36 @@ def remove_global_guarded_variable(self, variable: VariableBase):
         if variable in self._global_guarded_variables:
             self._global_guarded_variables.remove(variable)
 
+    def _find_tensor_inputs(
+        self, input_names: list[str]
+    ) -> OrderedSet[TensorVariable | SymbolicVariable]:
+        inputs: OrderedSet[TensorVariable | SymbolicVariable] = OrderedSet()
+        for name in input_names:
+            found = False
+            for variable in self.input_variables:
+                if (
+                    isinstance(variable, (TensorVariable, SymbolicVariable))
+                    and variable.get_symbol().name == name
+                ):
+                    inputs.add(variable)
+                    found = True
+                    break
+            assert found, f"can't find input {name} in SIR."
+        assert len(inputs) == len(input_names), "Number of inputs not match."
+        return inputs
+
+    def gen_load_inputs(
+        self, inputs: OrderedSet[TensorVariable | SymbolicVariable]
+    ):
+        for input_var in inputs:
+            if isinstance(input_var, SymbolicVariable):
+                self.pycode_gen.gen_load_object(
+                    paddle.to_tensor, "___paddle_to_tensor"
+                )
+            input_var.tracker.gen_instructions(self.pycode_gen)
+            if isinstance(input_var, SymbolicVariable):
+                self.pycode_gen.gen_call_function(1)
+
     def _find_tensor_outputs(
         self, outputs: list[VariableBase]
     ) -> OrderedSet[TensorVariable | SymbolicVariable]:
@@ -738,12 +773,14 @@ def _find_tensor_outputs(
         def is_graph_output(
             var,
         ) -> TypeGuard[TensorVariable | SymbolicVariable]:
-            return isinstance(var.tracker, DummyTracker) and isinstance(
-                var, (TensorVariable, SymbolicVariable)
-            )
+            return isinstance(
+                var.tracker, (DummyTracker, SymbolicOperationTracker)
+            ) and isinstance(var, (TensorVariable, SymbolicVariable))
 
         def collect_related_dummy_tensor(var):
-            if isinstance(var.tracker, DummyTracker):
+            if isinstance(
+                var.tracker, (DummyTracker, SymbolicOperationTracker)
+            ):
                 if is_graph_output(var):
                     return [var]
                 else:
@@ -758,7 +795,9 @@ def collect_related_dummy_tensor(var):
         ] = OrderedSet()
         # Find Tensor Variables from outputs.
         for output in outputs:
-            if isinstance(output.tracker, DummyTracker):
+            if isinstance(
+                output.tracker, (DummyTracker, SymbolicOperationTracker)
+            ):
                 if is_graph_output(output):
                     output_tensors.add(output)
                 else:
@@ -809,7 +848,7 @@ def restore_print_stmts(self, variables: list[VariableBase]):
                 add_to_global_guarded_vars=False,
             )
 
-    def restore_inplace_tensor(self, variables: list[VariableBase]):
+    def restore_inplace_tensor(self, variables: OrderedSet[VariableBase]):
         for var in variables:
             if not var.tracker.is_traceable():
                 continue
diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
index 70870913a6a02..3146609a595b0 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
@@ -1737,11 +1737,12 @@ def RETURN_CONST(self, instr: Instruction):
         return self.compile_return(ret_const)
 
     def compile_return(self, ret_val):
-        compile_fn = self._graph.get_compiled_fn(ret_val)
-        if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get():
+        compile_graph_result = self._graph.compile_graph(ret_val)
+        graph_fn, _ = compile_graph_result
+        if graph_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get():
             self.new_code = None
         else:
-            self._graph.start_compile(ret_val)
+            self._graph.compile_function(compile_graph_result, [ret_val])
             self._graph.pycode_gen.gen_return()
             self.new_code = self._graph.pycode_gen.gen_pycode()
         self.guard_fn = self._graph.guard_fn
@@ -1775,15 +1776,16 @@ def get_compute_fn_and_update_changed_vars(
                 store_vars.append(_var)
             store_var_info[_var.id] = name
 
-        compile_fn = self._graph.get_compiled_fn(*store_vars)
+        compile_graph_result = self._graph.compile_graph(*store_vars)
+        graph_fn, _ = compile_graph_result
 
-        if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get():
+        if graph_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get():
             return self._graph._restore_origin_opcode(
                 list(stack), store_var_info, end_idx
             )
         else:
             return self._graph._build_compile_fn_with_name_store(
-                store_vars, store_var_info
+                compile_graph_result, store_vars, store_var_info
             )
 
     @fallback_when_occur_error
diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker.py b/python/paddle/jit/sot/opcode_translator/executor/tracker.py
index 41ce17dba7cbc..85a7f68f6847a 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/tracker.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/tracker.py
@@ -127,7 +127,7 @@ def need_guard(self) -> bool:
         return False
 
 
-class SymbolicOperationTracker(DummyTracker):
+class SymbolicOperationTracker(Tracker):
     """
     SymbolicOperationTracker is a subclass of Tracker that specifically tracks variables cannot be reproduced from the frame.
     It is mostly generated by complex operations of symbolic variables.
@@ -151,6 +151,14 @@ def trace_value_from_frame(self):
     def __repr__(self) -> str:
         return f"SymbolicOperationTracker(num_inputs={len(self.inputs)})"
 
+    def is_traceable(self):
+        # TODO(zrr1999): to implement gen_instructions and trace_value_from_frame
+        return False
+
+    def need_guard(self) -> bool:
+        # TODO(zrr1999): to implement gen_instructions and trace_value_from_frame
+        return False
+
 
 class DanglingTracker(Tracker):
     """
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py
index 965b7edba28ed..ffec4b1485cb6 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py
@@ -90,6 +90,8 @@
     core.DataType.BOOL: "bool",
 }
 
+STATIC_DIM_FREQ_THRESHOLD = 5
+
 
 class ConstantVariable(VariableBase):
     """
@@ -174,24 +176,6 @@ def chr(self):
             DummyTracker([self]),
         )
 
-    @check_guard
-    def make_stringify_guard(self) -> list[StringifyExpression]:
-        if (
-            ENV_SOT_ALLOW_DYNAMIC_SHAPE.get()
-            and isinstance(self.value, int)
-            and self.tracker.need_guard()
-        ):
-            from ..executor_cache import OpcodeExecutorCache
-
-            frame_value_tracer = self.tracker.trace_value_from_frame()
-            symbolic_inputs = OpcodeExecutorCache().symbolic_inputs
-            symbolic_inputs.setdefault(frame_value_tracer.inlined_expr, {})
-            symbolic_input = symbolic_inputs[frame_value_tracer.inlined_expr]
-            symbolic_input.setdefault(self.value, 0)
-            symbolic_input[self.value] += 1
-
-        return super().make_stringify_guard()
-
     @VariableFactory.register_from_value()
     def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
         if type(value) in ConstTypes:
@@ -349,10 +333,24 @@ def __init__(
             raise InnerError(
                 f"Required type(tensor) is paddle.Tensor or ProxyTensor, but received {type(tensor).__name__}."
             )
+        dynamic_axes: list[int] = []
+        if ENV_SOT_ALLOW_DYNAMIC_SHAPE.get() and self.tracker.is_traceable():
+            dynamic_axes = self.analyse_dynamic_axes()
+        self.meta.dynamic_axes = dynamic_axes
         self.origin_meta = self.meta
         self.var_name = TensorVariable.var_name_generator.next()
         self.graph.side_effects.record_mutable_variable(self)
 
+    def analyse_dynamic_axes(self):
+        shape_dims = (
+            self.shape.proxy.get_all()
+        )  # Trigger conversion of all shape dims to Variables
+        return [
+            i
+            for i, dim in enumerate(shape_dims)
+            if isinstance(dim, SymbolicVariable)
+        ]
+
     def __len__(self):
         if self.meta.shape[0] == -1:
             raise BreakGraphError(
@@ -399,9 +397,13 @@ def _reconstruct(self, codegen: PyCodeGen):
     def make_stringify_guard(self) -> list[StringifyExpression]:
         frame_value_tracer = self.tracker.trace_value_from_frame()
 
+        if ENV_SOT_ALLOW_DYNAMIC_SHAPE.get():
+            str_left_expr = f"MetaInfo.from_tensor({{}}, dynamic_axes={self.meta.dynamic_axes}).guard_str()"
+        else:
+            str_left_expr = "MetaInfo.from_tensor({}).guard_str()"
         return [
             StringifyExpression(
-                f"MetaInfo.from_tensor({{}}).guard_str() == '{self.origin_meta.guard_str()}'",
+                f"{str_left_expr} == '{self.origin_meta.guard_str()}'",
                 [frame_value_tracer],
                 union_free_vars(
                     {"MetaInfo": MetaInfo},
@@ -483,15 +485,15 @@ def size(self):
 
     @tensor_property
     def shape(self):
+        # TODO(zrr1999): support more tensor properties
         if self.meta.is_dynamic_shape():
             raise BreakGraphError(
                 f"Getting shape for a dynamic shape tensor causes graph break. shape = {self.meta.shape}"
             )
         from .container import ListVariable
 
-        return ListVariable(
-            self.meta.shape, self.graph, tracker=DummyTracker([self])
-        )
+        tracker = GetAttrTracker(self, "shape")
+        return ListVariable(self.meta.shape, self.graph, tracker=tracker)
 
     def numel(self):
         return self.size
@@ -605,7 +607,7 @@ class SymbolicVariable(VariableBase):
 
     def __init__(
         self,
-        value: int | MetaInfo,
+        value: int | None | MetaInfo,
         graph: FunctionGraph,
         tracker: Tracker,
     ):
@@ -663,7 +665,9 @@ def make_stringify_guard(self) -> list[StringifyExpression]:
         from ..executor_cache import OpcodeExecutorCache
 
         frame_value_tracer = self.tracker.trace_value_from_frame()
-        symbolic_inputs = OpcodeExecutorCache().symbolic_inputs
+        symbolic_inputs = OpcodeExecutorCache().get_symbolic_inputs(
+            self.graph.pycode_gen._origin_code
+        )
 
         assert frame_value_tracer.inlined_expr in symbolic_inputs
 
@@ -681,25 +685,42 @@ def make_stringify_guard(self) -> list[StringifyExpression]:
             )
         ]
 
+    @staticmethod
+    def should_create_symbolic_variable(
+        value: Any, tracker: Tracker, symbolic_inputs: dict[str, dict[int, int]]
+    ):
+        tracker_expr = tracker.trace_value_from_frame().inlined_expr
+        symbolic_inputs.setdefault(tracker_expr, {})
+        for expr, symbolic_input in symbolic_inputs.items():
+            if tracker.match_expr(expr):
+                symbolic_input.setdefault(value, 0)
+                symbolic_input[value] += 1
+                if symbolic_input[value] >= STATIC_DIM_FREQ_THRESHOLD:
+                    return False
+                if len(symbolic_input.keys()) > 1:
+                    return True
+                return False
+        return False
+
     @VariableFactory.register_from_value(successor="ConstantVariable")
     def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
         if not ENV_SOT_ALLOW_DYNAMIC_SHAPE.get():
-            return
+            return None
         if not isinstance(value, int):
-            return
-        if not tracker.need_guard():
-            return
+            return None
+        if not tracker.is_traceable():
+            return None
 
         from ..executor_cache import OpcodeExecutorCache
 
-        symbolic_inputs = OpcodeExecutorCache().symbolic_inputs
+        symbolic_inputs = OpcodeExecutorCache().get_symbolic_inputs(
+            graph.pycode_gen._origin_code
+        )
 
-        for tracker_expr, symbolic_input in symbolic_inputs.items():
-            if tracker.match_expr(tracker_expr):
-                symbolic_input.setdefault(value, 0)
-                symbolic_input[value] += 1
-                # TODO(zrr1999): determine frequency
-                return SymbolicVariable(value, graph, tracker)
+        if SymbolicVariable.should_create_symbolic_variable(
+            value, tracker, symbolic_inputs
+        ):
+            return SymbolicVariable(value, graph, tracker)
         return None
 
 
diff --git a/python/paddle/jit/sot/symbolic/compile_cache.py b/python/paddle/jit/sot/symbolic/compile_cache.py
index b697e721532f9..5cb06059bb3db 100644
--- a/python/paddle/jit/sot/symbolic/compile_cache.py
+++ b/python/paddle/jit/sot/symbolic/compile_cache.py
@@ -21,6 +21,7 @@
 from paddle.amp.auto_cast import amp_state
 from paddle.base.data_feeder import convert_dtype
 from paddle.framework import _dygraph_tracer, use_pir_api
+from paddle.static import InputSpec
 
 from ..infer_meta import convert_meta_to_input_spec
 from ..profiler import EventGuard
@@ -162,7 +163,13 @@ class CompileSIRCache(Cache, metaclass=Singleton):
     def __init__(self):
         super().__init__(weak=False)
 
-    def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
+    def key_fn(
+        self,
+        context: SymbolicTraceContext,
+        sir_name: str,
+        input_spec: list[InputSpec],
+        **kwargs,
+    ):
         """
         generate a hash key for a SIR
 
@@ -176,10 +183,16 @@ def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
         """
         sir = context.get_sir(sir_name)
         # NOTE(dev): Is str(sir) a heavy operation ?
-        hash_key = hash((str(sir), kwargs['training']))
+        hash_key = hash((str(sir), *input_spec, kwargs['training']))
         return hash_key
 
-    def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
+    def value_fn(
+        self,
+        context: SymbolicTraceContext,
+        sir_name: str,
+        input_spec: list[InputSpec],
+        **kwargs,
+    ):
         """
         Generate static graph function
 
@@ -196,6 +209,7 @@ def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
         return FallbackWrapper(
             paddle.jit.to_static(
                 compile_sir(context, sir_name),
+                input_spec=[input_spec],
                 build_strategy=build_strategy,
                 backend=backend,
                 full_graph=True,
diff --git a/python/paddle/jit/sot/symbolic/symbolic_context.py b/python/paddle/jit/sot/symbolic/symbolic_context.py
index cc6487f696d0a..4efe3038c2781 100644
--- a/python/paddle/jit/sot/symbolic/symbolic_context.py
+++ b/python/paddle/jit/sot/symbolic/symbolic_context.py
@@ -14,6 +14,10 @@
 
 from __future__ import annotations
 
+from typing import Any, Callable
+
+from paddle.static import InputSpec
+
 from ..utils import log
 from .compile_cache import CompileSIRCache
 from .statement_ir import (
@@ -126,7 +130,15 @@ def replace_TOS(self, sir):
         self.sir_stack.append(sir)
         self.statement_factory.update(sir)
 
-    def compile_do_nothing(self, ret_vals):
+    def return_TOS(self, ret_vals):
+        cur_sir: StatementIR = self.TOS
+        cur_sir.inputs = cur_sir.analyse_inputs()
+        cur_sir.outputs = ret_vals
+        log(2, "start subgraph compile and execution.\n")
+        log(2, self.TOS, "\n")
+        return cur_sir
+
+    def compile_do_nothing(self) -> Callable[[...], Any]:
         """
         Return a dummy function, which will return an empty list.
 
@@ -141,29 +153,12 @@ def __call__(*args, **kwargs):
             def graph_size(self):
                 return 0
 
-        # return None function
-        dummy_stmt_ir = StatementIR("dummy_func")
-        dummy_stmt_ir.outputs = []
-        dummy_stmt_ir.inputs = []
-        return DummyFunc(), dummy_stmt_ir
+        return DummyFunc()
 
-    def compile_fn(self, ret_vals, **kwargs):
+    def compile_fn(self, sir_name: str, input_spec: list[InputSpec], **kwargs):
         """
         start compile and return the python function, which must can be to_static without errors.
         """
-        cur_sir: StatementIR = self.TOS
-        # step0: if no statement, return a dummy function
-        if len(cur_sir.statements) == 0:
-            return self.compile_do_nothing(ret_vals)
-        # step1: analyse sir inputs and outputs
-        cur_sir.inputs = cur_sir.analyse_inputs()
-        # TODO: output analysis
-        cur_sir.outputs = ret_vals
-        log(2, "start subgraph compile and execution.\n")
-        log(2, self.TOS, "\n")
-        # step2: call compile_sir and get python function, third cache is triggered here.
-        static_func = CompileSIRCache()(self, cur_sir.name, **kwargs)
-        # step3: GC and reset TOS
-        # self.reset_TOS()
+        static_func = CompileSIRCache()(self, sir_name, input_spec, **kwargs)
 
-        return static_func, cur_sir
+        return static_func
diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py
index ddf0cf9c8b02e..c281e335efb3d 100644
--- a/python/paddle/jit/translated_layer.py
+++ b/python/paddle/jit/translated_layer.py
@@ -892,8 +892,7 @@ def _run_dygraph(instance, input, program_holder):
     for i, value in enumerate(input):
         if not isinstance(value, (np.ndarray, core.eager.Tensor)):
             raise TypeError(
-                "The type of input in TranslatedLayer must be numpy array or Variable(Tensor), but received %s."
-                % type(value)
+                f"The type of input in TranslatedLayer must be numpy array or Variable(Tensor), but received {type(value)}."
             )
         # NOTE: In order to unify the API, firstly convert the input to Tensor
         if isinstance(value, np.ndarray):
@@ -925,8 +924,7 @@ def _run_dygraph(instance, input, program_holder):
             persistable_vars.append(instance._buffers[dy_var_name])
         else:
             raise ValueError(
-                "The persistable variable %s does not exist in current TranslatedLayer."
-                % var_name
+                f"The persistable variable {var_name} does not exist in current TranslatedLayer."
             )
 
     output_vars = []
@@ -1426,7 +1424,7 @@ def _construct(model_path, configs=None):
         # 0. dir and filename check
         model_path = os.path.normpath(model_path)
         if not os.path.isdir(model_path):
-            raise ValueError("There is no directory named '%s'" % model_path)
+            raise ValueError(f"There is no directory named '{model_path}'")
         model_filename = None
         params_filename = None
         if configs is not None:
@@ -1591,8 +1589,7 @@ def _get_program_holder(self, method_name='forward'):
         program_holder = self._program_holder_dict.get(method_name, None)
         if program_holder is None:
             raise ValueError(
-                "The method `%s` does not exist in loaded TranslatedLayer."
-                % method_name
+                f"The method `{method_name}` does not exist in loaded TranslatedLayer."
             )
         return program_holder
 
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index a9d8312bb4ca0..bf87c3fc0f0a8 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -69,7 +69,9 @@
     Upsample,
     UpsamplingBilinear2D,
     UpsamplingNearest2D,
+    ZeroPad1D,
     ZeroPad2D,
+    ZeroPad3D,
 )
 
 # TODO: import all neural network related api under this directory,
@@ -135,6 +137,8 @@
     AvgPool3D,
     FractionalMaxPool2D,
     FractionalMaxPool3D,
+    LPPool1D,
+    LPPool2D,
     MaxPool1D,
     MaxPool2D,
     MaxPool3D,
@@ -300,4 +304,8 @@
     'Unflatten',
     'FractionalMaxPool2D',
     'FractionalMaxPool3D',
+    'LPPool1D',
+    'LPPool2D',
+    'ZeroPad1D',
+    'ZeroPad3D',
 ]
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 4543d5c8ca14d..bc0f0e1d2c388 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -144,6 +144,8 @@
     avg_pool3d,
     fractional_max_pool2d,
     fractional_max_pool3d,
+    lp_pool1d,
+    lp_pool2d,
     max_pool1d,
     max_pool2d,
     max_pool3d,
@@ -220,6 +222,8 @@
     'avg_pool1d',
     'avg_pool2d',
     'avg_pool3d',
+    'lp_pool1d',
+    'lp_pool2d',
     'max_pool1d',
     'max_pool2d',
     'max_pool3d',
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 3dd30afeec986..ddfb04d8530a1 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -1543,7 +1543,7 @@ def tanhshrink(x, name=None):
         return out
 
 
-def thresholded_relu(x, threshold=1.0, name=None):
+def thresholded_relu(x, threshold=1.0, value=0.0, name=None):
     r"""
     thresholded relu activation.
 
@@ -1553,7 +1553,7 @@ def thresholded_relu(x, threshold=1.0, name=None):
             \left\{
                 \begin{array}{rl}
                 x,& \text{if } \ x > threshold \\
-                0,& \text{otherwise}
+                value,& \text{otherwise}
                 \end{array}
             \right.
 
@@ -1561,6 +1561,7 @@ def thresholded_relu(x, threshold=1.0, name=None):
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         threshold (float, optional): The value of threshold for thresholded_relu. Default is 1.0
+        value (float, optional): The value used to replace the elements of x that are not greater than threshold. Default is 0.0
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
@@ -1580,7 +1581,7 @@ def thresholded_relu(x, threshold=1.0, name=None):
     """
 
     if in_dynamic_or_pir_mode():
-        return _C_ops.thresholded_relu(x, threshold)
+        return _C_ops.thresholded_relu(x, threshold, value)
     else:
         check_variable_and_dtype(
             x,
@@ -1594,19 +1595,19 @@ def thresholded_relu(x, threshold=1.0, name=None):
             type='thresholded_relu',
             inputs={'X': x},
             outputs={'Out': out},
-            attrs={'threshold': threshold},
+            attrs={'threshold': threshold, 'value': value},
         )
         return out
 
 
 @inplace_apis_in_dygraph_only
-def thresholded_relu_(x, threshold=1.0, name=None):
+def thresholded_relu_(x, threshold=1.0, value=0.0, name=None):
     r"""
     Inplace version of ``thresholded_relu`` API, the output Tensor will be inplaced with input ``x``.
     Please refer to :ref:`api_paddle_nn_functional_thresholded_relu`.
     """
     if in_dynamic_mode():
-        return _C_ops.thresholded_relu_(x, threshold)
+        return _C_ops.thresholded_relu_(x, threshold, value)
 
 
 def log_softmax(x, axis=-1, dtype=None, name=None):
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index c9272e3a9c05e..3703abd739d57 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -2343,3 +2343,260 @@ def fractional_max_pool3d(
         )
 
         return (pool_out, mask) if return_mask else pool_out
+
+
+def lp_pool1d(
+    x,
+    norm_type,
+    kernel_size,
+    stride=None,
+    padding=0,
+    ceil_mode=False,
+    data_format="NCL",
+    name=None,
+):
+    """
+    This API implements power-average pooling 1d operation.
+    See more details in :ref:`api_paddle_nn_LPPool1d` .
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 3-D tensor with
+                          shape [N, C, L]. where `N` is batch size, `C` is the number of channels,
+                          `L` is the length of the feature. The data type is float16, float32 or float64.
+        norm_type (int|float): The exponent of the power operation.
+        kernel_size (int|list|tuple): The pool kernel size. If it is a tuple or list,
+            it must contain exactly one integer, because only the length dimension
+            of the input is pooled.
+        stride (int|list|tuple, optional): The stride size. If it is a tuple or list,
+            it must contain exactly one integer. Default: None, in which case the
+            stride is equal to ``kernel_size``.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every sides.
+            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides.
+            4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode(bool, optional): When True, it will use `ceil` instead of `floor` to compute the output shape. Default: False.
+        data_format(str, optional): The data format of the input and output data. An optional string from: `"NCL"`,
+            `"NLC"`. When it is `"NCL"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_length]`. Default:`"NCL"`.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> data = paddle.uniform([1, 3, 32], paddle.float32)
+            >>> LPPool1D = nn.LPPool1D(norm_type=3, kernel_size=2, stride=2, padding=0)
+            >>> pool_out = LPPool1D(data)
+            >>> print(pool_out.shape)
+            [1, 3, 16]
+    """
+    # NCL to NCHW
+    ori_data_format = data_format
+    if data_format == "NCL":
+        data_format = "NCHW"
+        axis = 2
+    else:
+        data_format = "NHWC"
+        axis = 1
+
+    if not in_dynamic_mode():
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'lp_pool1d'
+        )
+    _check_input(x, 3)
+    x = unsqueeze(x, [axis])
+    kernel_size = convert_to_list(kernel_size, 1, 'kernel_size')
+    kernel_size = [1] + kernel_size
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = convert_to_list(stride, 1, 'pool_stride')
+        stride = [1] + stride
+
+    _check_value_limitation(kernel_size, "kernel_size", min_limit=1e-3)
+    _check_value_limitation(stride, "stride", min_limit=1e-3)
+
+    channel_last = _channel_last(ori_data_format, 1)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 1, channel_last=channel_last, ceil_mode=ceil_mode
+    )
+
+    # use 2d to implement 1d should expand padding in advance.
+    padding = _expand_low_nd_padding(padding)
+
+    if in_dynamic_or_pir_mode():
+        output = _C_ops.lp_pool2d(
+            x,
+            kernel_size,
+            stride,
+            padding,
+            ceil_mode,
+            True,
+            data_format,
+            'lp',
+            False,
+            False,
+            padding_algorithm,
+            norm_type,
+        )
+        return squeeze(output, [axis])
+
+    else:
+        op_type = 'lp_pool2d'
+        helper = LayerHelper(op_type, **locals())
+        dtype = helper.input_dtype(input_param_name='x')
+        pool_out = helper.create_variable_for_type_inference(dtype)
+
+        helper.append_op(
+            type=op_type,
+            inputs={"x": x},
+            outputs={"out": pool_out},
+            attrs={
+                "pooling_type": "lp",
+                "kernel_size": kernel_size,
+                "global_pooling": False,
+                "strides": stride,
+                "paddings": padding,
+                "padding_algorithm": padding_algorithm,
+                "ceil_mode": ceil_mode,
+                "exclusive": True,
+                "data_format": data_format,
+                "norm_type": norm_type,
+            },
+        )
+        return squeeze(pool_out, [axis])
+
+
+def lp_pool2d(
+    x,
+    norm_type,
+    kernel_size,
+    stride=None,
+    padding=0,
+    ceil_mode=False,
+    data_format="NCHW",
+    name=None,
+):
+    """
+    This API implements power-average pooling 2d operation.
+    See more details in :ref:`api_paddle_nn_LPPool2d` .
+
+    Args:
+        x (Tensor): The input tensor of pooling operator which is a 4-D tensor with
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` or
+                          `"NHWC"`, where `N` is batch size, `C` is the number of channels,
+                          `H` is the height of the feature, and `W` is the width of the
+                          feature. The data type is float32 or float64.
+        norm_type (int|float): The exponent of the power operation. It cannot be 0.
+        kernel_size (int|list|tuple): The pool kernel size. If it is a tuple or list,
+            it must contain two integers, (kernel_size_Height, kernel_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride (int|list|tuple): The stride size. If it is a tuple or list,
+            it must contain two integers, (stride_Height, stride_Width).
+            Otherwise, the stride size will be a square of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every sides.
+            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode(bool, optional): When True, it will use `ceil` instead of `floor` to compute the output shape. Default: False.
+        data_format (string, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
+                        The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+                        `[batch_size, input_channels, input_height, input_width]`. Default: "NCHW".
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is same as input tensor.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn.functional as F
+
+            >>> # lp pool2d
+            >>> x = paddle.uniform([1, 3, 32, 32], paddle.float32)
+            >>> out = F.lp_pool2d(x,
+            ...                   norm_type=2,
+            ...                   kernel_size=2,
+            ...                   stride=2, padding=0)
+            >>> print(out.shape)
+            [1, 3, 16, 16]
+    """
+
+    _check_input(x, 4)
+    if norm_type == 0:
+        raise ValueError("`norm_type` cannot be 0.")
+
+    norm_type = float(norm_type)
+    kernel_size = convert_to_list(kernel_size, 2, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = convert_to_list(stride, 2, 'pool_stride')
+
+    _check_value_limitation(kernel_size, "kernel_size", min_limit=1e-3)
+    _check_value_limitation(stride, "stride", min_limit=1e-3)
+
+    channel_last = _channel_last(data_format, 2)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 2, channel_last, ceil_mode=ceil_mode
+    )
+
+    if in_dynamic_or_pir_mode():
+        output = _C_ops.lp_pool2d(
+            x,
+            kernel_size,
+            stride,
+            padding,
+            ceil_mode,
+            True,
+            data_format,
+            'lp',
+            False,
+            False,
+            padding_algorithm,
+            norm_type,
+        )
+        return output
+    else:
+        op_type = 'lp_pool2d'
+        helper = LayerHelper(op_type, **locals())
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'uint16', 'float32', 'float64'], 'lp_pool2d'
+        )
+        dtype = helper.input_dtype(input_param_name='x')
+        pool_out = helper.create_variable_for_type_inference(dtype)
+
+        helper.append_op(
+            type=op_type,
+            inputs={"x": x},
+            outputs={"out": pool_out},
+            attrs={
+                "pooling_type": "lp",
+                "kernel_size": kernel_size,
+                "global_pooling": False,
+                "strides": stride,
+                "paddings": padding,
+                "padding_algorithm": padding_algorithm,
+                "ceil_mode": ceil_mode,
+                "exclusive": True,
+                "data_format": data_format,
+                "norm_type": norm_type,
+            },
+        )
+
+        return pool_out
diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py
index e281d6cd48589..270f0bb9234ea 100644
--- a/python/paddle/nn/initializer/__init__.py
+++ b/python/paddle/nn/initializer/__init__.py
@@ -18,7 +18,7 @@
     Assign,
     NumpyArrayInitializer,  # noqa: F401
 )
-from .Bilinear import Bilinear
+from .bilinear import Bilinear
 from .constant import (
     Constant,
     ConstantInitializer,  # noqa: F401
diff --git a/python/paddle/nn/initializer/Bilinear.py b/python/paddle/nn/initializer/bilinear.py
similarity index 100%
rename from python/paddle/nn/initializer/Bilinear.py
rename to python/paddle/nn/initializer/bilinear.py
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index 27d5cd4ecefa4..80751daecd0e7 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -103,6 +103,8 @@
     AvgPool3D,
     FractionalMaxPool2D,
     FractionalMaxPool3D,
+    LPPool1D,
+    LPPool2D,
     MaxPool1D,
     MaxPool2D,
     MaxPool3D,
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index c1234c28bc47d..b08f5f9ca8bbb 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -1164,13 +1164,14 @@ class ThresholdedReLU(Layer):
             \left\{
                 \begin{array}{rl}
                 x,& \text{if } \ x > threshold \\
-                0,& \text{otherwise}
+                value,& \text{otherwise}
                 \end{array}
             \right.
 
 
     Parameters:
         threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0
+        value (float, optional): The value used to replace the elements of x that are not greater than threshold. Default is 0.0
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
@@ -1191,17 +1192,18 @@ class ThresholdedReLU(Layer):
             [2., 0., 0.])
     """
 
-    def __init__(self, threshold=1.0, name=None):
+    def __init__(self, threshold=1.0, value=0.0, name=None):
         super().__init__()
         self._threshold = threshold
+        self._value = value
         self._name = name
 
     def forward(self, x):
-        return F.thresholded_relu(x, self._threshold, self._name)
+        return F.thresholded_relu(x, self._threshold, self._value, self._name)
 
     def extra_repr(self):
         name_str = f', name={self._name}' if self._name else ''
-        return f'threshold={self._threshold}{name_str}'
+        return f'threshold={self._threshold}, value={self._value}{name_str}'
 
 
 class Silu(Layer):
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 6faf07bb6eb19..6b34c9fa90f6b 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1077,6 +1077,67 @@ def extra_repr(self):
         return f'padding={self._pad}, mode={self._mode}, value={self._value}, data_format={self._data_format}{name_str}'
 
 
+class ZeroPad1D(Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad1D`` class.
+    Pads the input tensor boundaries with zero.
+
+    Parameters:
+        padding (Tensor | List[int] | int): The padding size with data type int. If it is an int, use the
+            same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded.
+            The pad has the form (pad_left, pad_right).
+        data_format (str): A string from: "NCL", "NLC". Specify the data format of the input data.
+           Default is "NCL".
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x(Tensor): The input tensor of zeropad1d operator, which is a 3-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of zeropad1d operator, which is a 3-D tensor.
+          The data type is same as input x.
+
+    Examples:
+
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> input_shape = (1, 2, 3)
+            >>> pad = [1, 2]
+            >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1
+            >>> my_pad = nn.ZeroPad1D(padding=pad)
+            >>> result = my_pad(data)
+            >>> print(result)
+            Tensor(shape=[1, 2, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[0., 1., 2., 3., 0., 0.],
+              [0., 4., 5., 6., 0., 0.]]])
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super().__init__()
+        self._pad = _npairs(padding, 1)
+        self._mode = 'constant'
+        self._value = 0.0
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
+
+    def extra_repr(self):
+        name_str = f', name={self._name}' if self._name else ''
+        return f'padding={self._pad}, data_format={self._data_format}{name_str}'
+
+
 class Pad2D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad2D`` class.
@@ -1290,6 +1351,70 @@ def extra_repr(self):
         return f'padding={self._pad}, mode={self._mode}, value={self._value}, data_format={self._data_format}{name_str}'
 
 
+class ZeroPad3D(Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad3D`` class.
+    Pads the input tensor boundaries with zero.
+
+    Parameters:
+        padding (Tensor | List[int] | int): The padding size with data type int. If it is an int, use the
+            same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded.
+            The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        data_format (str): A string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+           Default is "NCDHW".
+        name (str, optional) : The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x(Tensor): The input tensor of zeropad3d operator, which is a 5-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of zeropad3d operator, which is a 5-D tensor.
+          The data type is same as input x.
+
+    Examples:
+
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> input_shape = (1, 1, 1, 2, 3)
+            >>> pad = [1, 0, 1, 2, 0, 0]
+            >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1
+            >>> my_pad = nn.ZeroPad3D(padding=pad)
+            >>> result = my_pad(data)
+            >>> print(result)
+            Tensor(shape=[1, 1, 1, 5, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[[[0., 0., 0., 0.],
+                [0., 1., 2., 3.],
+                [0., 4., 5., 6.],
+                [0., 0., 0., 0.],
+                [0., 0., 0., 0.]]]]])
+    """
+
+    def __init__(self, padding, data_format="NCDHW", name=None):
+        super().__init__()
+        self._pad = _npairs(padding, 3)
+        self._mode = 'constant'
+        self._value = 0.0
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
+
+    def extra_repr(self):
+        name_str = f', name={self._name}' if self._name else ''
+        return f'padding={self._pad}, data_format={self._data_format}{name_str}'
+
+
 class CosineSimilarity(Layer):
     """
     This interface is used to compute cosine similarity between x1 and x2 along axis.
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 23eaf467d916d..3127bb636502f 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -132,6 +132,7 @@ class AvgPool2D(Layer):
             Output(N_i, C_j, h, w)  = \frac{\sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
                 Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)}{ksize[0] * ksize[1]}
 
+
     Parameters:
         kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
             it must contain two integers, (pool_size_Height, pool_size_Width).
@@ -153,7 +154,7 @@ class AvgPool2D(Layer):
         divisor_override(float, optional): If specified, it will be used as divisor, otherwise kernel_size will be
             used. Default None.
         data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`,
-            `"NDHW"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
+            `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
             `[batch_size, input_channels, input_height, input_width]`.
         name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
             Usually name is no need to set and None by default.
@@ -321,6 +322,210 @@ def extra_repr(self):
         )
 
 
+class LPPool1D(Layer):
+    r"""
+    Performing a 1D power-average pooling over an input signal composed
+    of several input planes, based on the input, norm_type, kernel_size, stride and padding parameters.
+    Input(X) and output(Out) are in NCL format, where N is batch
+    size, C is the number of channels, L is the length of the feature.
+    The output tensor shape will be [N, C, :math:`L_{out}`].
+
+    The output value of the layer with input size (N, C, L),
+    output (N, C, :math:`L_{out}`) and kernel_size ksize can be precisely described as
+    For lp pool1d:
+
+    ..  math::
+
+        Output(N_i, C_i, l) = sum(Input[N_i, C_i, stride \times l:stride \times l+k]^{norm\_type})^{1/norm\_type}
+
+    Parameters:
+        norm_type(int|float): The exponent used in the power operation (the ``norm_type`` in the formula above).
+        kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain an integer.
+        stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list,
+            it must contain an integer. Default None, then stride will be equal to the kernel_size.
+        padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every sides.
+            3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides.
+            4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode(bool, optional): When True, it will use `ceil` instead of `floor` to compute the output shape. Default: False.
+        data_format(str, optional): The data format of the input and output data. An optional string from: `"NCL"`,
+            `"NLC"`. When it is `"NCL"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_length]`. Default: "NCL"
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually name is no need to set and None by default.
+
+    Shape:
+        - x(Tensor): The input tensor of lp pool1d operator, which is a 3-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of lp pool1d  operator, which is a 3-D tensor.
+          The data type is same as input x.
+
+    Returns:
+        A callable object of LPPool1D.
+
+    Examples:
+
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1)
+            >>> LPPool1D = nn.LPPool1D(norm_type=2, kernel_size=2, stride=2, padding=0)
+            >>> pool_out = LPPool1D(data)
+            >>> print(pool_out.shape)
+            [1, 3, 16]
+
+    """
+
+    def __init__(
+        self,
+        norm_type,
+        kernel_size,
+        stride=None,
+        padding=0,
+        ceil_mode=False,
+        data_format="NCL",
+        name=None,
+    ):
+        super().__init__()
+        self.norm_type = float(norm_type)
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        out = F.lp_pool1d(
+            x,
+            self.norm_type,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.ceil_mode,
+            self.data_format,
+            self.name,
+        )
+        return out
+
+    def extra_repr(self):
+        return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, padding={padding}'.format(
+            **self.__dict__
+        )
+
+
+class LPPool2D(Layer):
+    r"""
+    Performing 2D power-average pooling over input features based on the input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+
+    Example:
+        Input:
+            X shape: :math:`(N, C, H_{in}, W_{in})`
+        Attr:
+            - kernel_size: kernel_size
+            - norm_type: norm_type
+
+        Output:
+            Out shape: :math:`(N, C, H_{out}, W_{out})`
+
+        ..  math::
+
+            Output(N_i, C_j, h, w)  = (\sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1}
+                               Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)^{norm\_type})^{1 / norm\_type}
+
+    Parameters:
+        norm_type(int|float): The exponent used in the power operation (the ``norm_type`` in the formula above).
+        kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+            Default None, then stride will be equal to the kernel_size.
+        padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every sides.
+            3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode(bool, optional): When True, it will use `ceil` instead of `floor` to compute the output shape. Default: False.
+        data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`,
+            `"NHWC"`. When it is `"NCHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_height, input_width]`. Default: "NCHW".
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually name is no need to set and None by default.
+
+    Shape:
+        - x(Tensor): The input tensor of lp pool2d operator, which is a 4-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of lp pool2d  operator, which is a 4-D tensor.
+          The data type is same as input x.
+
+    Returns:
+        A callable object of LPPool2D.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> # lp pool2d
+            >>> input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1)
+            >>> LPPool2D = nn.LPPool2D(norm_type=2, kernel_size=2, stride=2, padding=0)
+            >>> output = LPPool2D(input)
+            >>> print(output.shape)
+            [1, 3, 16, 16]
+
+    """
+
+    def __init__(
+        self,
+        norm_type,
+        kernel_size,
+        stride=None,
+        padding=0,
+        ceil_mode=False,
+        data_format="NCHW",
+        name=None,
+    ):
+        super().__init__()
+        self.norm_type = float(norm_type)
+        self.ksize = kernel_size
+        self.stride = kernel_size if stride is None else stride
+        self.padding = padding
+        self.ceil_mode = ceil_mode
+        self.data_format = data_format
+        self.name = name
+
+    def forward(self, x):
+        return F.lp_pool2d(
+            x,
+            norm_type=self.norm_type,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            data_format=self.data_format,
+            name=self.name,
+        )
+
+    def extra_repr(self):
+        return 'norm_type={norm_type}, kernel_size={ksize}, stride={stride}, padding={padding}'.format(
+            **self.__dict__
+        )
+
+
 class MaxPool1D(Layer):
     """
     This operation applies 1D max pooling over input signal
@@ -458,7 +663,7 @@ class MaxPool2D(Layer):
             The default value is 0.
         ceil_mode(bool, optional): when True, will use `ceil` instead of `floor` to compute the output shape
         return_mask(bool, optional): Whether to return the max indices along with the outputs.
-        data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
+        data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
             The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
             `[batch_size, input_channels, input_height, input_width]`.
         name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py
index 1c2d962f720cf..41ad1839e1f8a 100644
--- a/python/paddle/nn/quant/quantized_linear.py
+++ b/python/paddle/nn/quant/quantized_linear.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle import _C_ops, version
+import paddle
+from paddle import _C_ops
 from paddle.base.data_feeder import check_dtype
 from paddle.base.framework import convert_np_dtype_to_dtype_
 from paddle.device.cuda import get_device_capability
@@ -24,7 +25,7 @@
 
 def _get_arch_info():
     # Get SMVersion from device.
-    cuda_version = version.cuda()
+    cuda_version = paddle.version.cuda()
     if cuda_version is not None and cuda_version != 'False':
         major, minor = get_device_capability()
         arch = int(major * 10 + minor)
diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py
index 1e078d048a5bf..18de1d8fc1940 100644
--- a/python/paddle/profiler/utils.py
+++ b/python/paddle/profiler/utils.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import functools
 import sys
+import types
 from contextlib import ContextDecorator, contextmanager
-from typing import Any
 from warnings import warn
 
 from paddle.base import core
@@ -82,7 +84,12 @@ def __enter__(self):
         self.begin()
         return self
 
-    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: types.TracebackType | None,
+    ):
         self.end()
 
     def begin(self):
diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py
index 661143f12dae8..98f5ca0b13ee5 100644
--- a/python/paddle/sparse/__init__.py
+++ b/python/paddle/sparse/__init__.py
@@ -17,6 +17,7 @@
     add,
     divide,
     is_same_shape,
+    mask_as,
     masked_matmul,
     matmul,
     multiply,
@@ -77,6 +78,7 @@
     'expm1',
     'mv',
     'matmul',
+    'mask_as',
     'masked_matmul',
     'addmm',
     'add',
diff --git a/python/paddle/sparse/binary.py b/python/paddle/sparse/binary.py
index 3aac3d5e7f144..abc943ac3c1fc 100644
--- a/python/paddle/sparse/binary.py
+++ b/python/paddle/sparse/binary.py
@@ -452,3 +452,60 @@ def is_same_shape(x, y):
 
     """
     return x.is_same_shape(y)
+
+
+@dygraph_only
+def mask_as(x, mask, name=None):
+    r"""
+    Filter the input dense tensor `x` using the `indices` of the sparse matrix `mask`,
+    which in turn generates a sparse matrix of the corresponding format.
+    The input `x` and `mask` must have the same shape, and the sparse tensor returned has the same indices as `mask`
+    even if `zero` values exist at the corresponding indices.
+
+    Args:
+        x (Tensor): The input tensor. It should be a DenseTensor.
+            The data type can be float32, float64, int32, int64, complex64, complex128, int8, int16, float16.
+        mask (Tensor): The input tensor. It can be SparseCooTensor or SparseCsrTensor.
+            It should be 2D or 3D when the mask is SparseCsrTensor.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: A sparse tensor.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> paddle.set_device('cpu')
+
+            >>> # csr sparse tensor
+            >>> crows = [0, 2, 3, 5]
+            >>> cols = [1, 3, 2, 0, 1]
+            >>> values = [1., 2., 3., 4., 5.]
+            >>> dense_shape = [3, 4]
+            >>> csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape)
+            >>> paddle.seed(2024)
+            >>> x = paddle.rand(dense_shape).astype(csr.dtype)
+            >>> out = paddle.sparse.mask_as(x, csr)
+            >>> print(out)
+            Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(cpu), stop_gradient=True,
+            crows=[0, 2, 3, 5],
+            cols=[1, 3, 2, 0, 1],
+            values=[0.23659813, 0.08467803, 0.64152628, 0.66596609, 0.90394485])
+
+            >>> # coo sparse tensor
+            >>> indices = [[0, 1, 2], [1, 2, 0]]
+            >>> values = [1.0, 2.0, 3.0]
+            >>> dense_shape = [3, 3]
+            >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape)
+            >>> paddle.seed(2024)
+            >>> x = paddle.rand(dense_shape).astype(coo.dtype)
+            >>> out = paddle.sparse.mask_as(x, coo)
+            >>> print(out)
+            Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(cpu), stop_gradient=True,
+            indices=[[0, 1, 2],
+                     [1, 2, 0]],
+            values=[0.23659813, 0.40340215, 0.64152628])
+
+    """
+    return _C_ops.sparse_mask_as(x, mask)
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 469145ac6a832..8039c1ead6478 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -60,11 +60,12 @@
 )
 from .pir_io import (
     get_pir_parameters,
+    load_inference_model_pir,
     load_pir,
-    load_pir_inference_model,
     load_vars_pir,
+    normalize_pir_program,
+    save_inference_model_pir,
     save_pir,
-    save_pir_inference_model,
     save_vars_pir,
 )
 
@@ -183,6 +184,8 @@ def normalize_program(program, feed_vars, fetch_vars, **kwargs):
             >>> normalized_program = paddle.static.normalize_program(program, [image], [predict])
 
     """
+    if in_pir_mode():
+        return normalize_pir_program(program, feed_vars, fetch_vars, **kwargs)
     if not isinstance(program, Program):
         raise TypeError(
             "program type must be `base.Program`, but received `%s`"
@@ -523,7 +526,7 @@ def save_inference_model(
     """
 
     if in_pir_mode():
-        save_pir_inference_model(
+        save_inference_model_pir(
             path_prefix, feed_vars, fetch_vars, executor, **kwargs
         )
         return
@@ -849,7 +852,7 @@ def load_inference_model(path_prefix, executor, **kwargs):
             # program to get the inference result.
     """
     if in_pir_mode():
-        return load_pir_inference_model(path_prefix, executor, **kwargs)
+        return load_inference_model_pir(path_prefix, executor, **kwargs)
     # check kwargs
     supported_args = ('model_filename', 'params_filename')
     deprecated_args = ('pserver_endpoints',)
diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py
index d2252ebc0a0bc..2be2cecf18742 100644
--- a/python/paddle/static/nn/metric.py
+++ b/python/paddle/static/nn/metric.py
@@ -245,6 +245,28 @@ def auc(
             [array(1.)]
 
     """
+    if in_pir_mode():
+        if ins_tag_weight is None:
+            ins_tag_weight = paddle.full(
+                shape=[1, 1], dtype="float32", fill_value=1.0
+            )
+        stat_pos = paddle.zeros(shape=[1, num_thresholds + 1], dtype="int64")
+        stat_neg = paddle.zeros(shape=[1, num_thresholds + 1], dtype="int64")
+        auc_out, batch_stat_pos, batch_stat_neg = _C_ops.auc(
+            input,
+            label,
+            stat_pos,
+            stat_neg,
+            ins_tag_weight,
+            curve,
+            num_thresholds,
+            slide_steps,
+        )
+        return (
+            auc_out,
+            batch_stat_pos,
+            batch_stat_neg,
+        )
     helper = LayerHelper("auc", **locals())
 
     if ins_tag_weight is None:
diff --git a/python/paddle/static/pir_io.py b/python/paddle/static/pir_io.py
index 38e5e69cfdbb1..ffbf75dfdbb26 100644
--- a/python/paddle/static/pir_io.py
+++ b/python/paddle/static/pir_io.py
@@ -90,18 +90,18 @@ def set_var(name, ndarray):
     p = t._place()
     if p.is_cpu_place():
         place = paddle.base.CPUPlace()
-    # elif p.is_cuda_pinned_place():
-    #     place = paddle.base.CUDAPinnedPlace()
-    # elif p.is_xpu_place():
-    #     p = paddle.base.core.Place()
-    #     p.set_place(t._place())
-    #     place = paddle.base.XPUPlace(p.xpu_device_id())
-    # elif p.is_custom_place():
-    #     p = paddle.base.core.Place()
-    #     p.set_place(t._place())
-    #     place = paddle.base.CustomPlace(
-    #         paddle.device.get_device().split(':')[0], p.custom_device_id()
-    #     )
+    elif p.is_cuda_pinned_place():
+        place = paddle.base.CUDAPinnedPlace()
+    elif p.is_xpu_place():
+        p = paddle.base.core.Place()
+        p.set_place(t._place())
+        place = paddle.base.XPUPlace(p.xpu_device_id())
+    elif p.is_custom_place():
+        p = paddle.base.core.Place()
+        p.set_place(t._place())
+        place = paddle.base.CustomPlace(
+            paddle.device.get_device().split(':')[0], p.custom_device_id()
+        )
     else:
         p = paddle.base.core.Place()
         p.set_place(t._place())
@@ -251,7 +251,13 @@ def normalize_pir_program(program, feed_vars, fetch_vars, **kwargs):
     if not all(isinstance(v, pir.Value) for v in fetch_vars):
         raise TypeError("fetch_vars type must be a Value or a list of Value.")
 
-    # TODO(Ruting) remind users to set auc_states to 0 if auc op were found.
+    # Remind users to set auc_states to 0 if an auc op is found.
+    for op in program.global_block().ops:
+        if op.name() == 'pd_op.auc':
+            warnings.warn(
+                "Be sure that you have set auc states to 0 before saving inference model."
+            )
+            break
 
     # fix the bug that the activation op's output as target will be pruned.
     # will affect the inference performance.
@@ -632,8 +638,8 @@ def load_pir(program, model_path, executor=None, var_list=None):
         model_prefix = model_prefix[:-9]
     elif model_prefix.endswith(".pdopt"):
         model_prefix = model_prefix[:-6]
-    elif model_prefix.endswith(".pdmodel"):
-        model_prefix = model_prefix[:-8]
+    elif model_prefix.endswith(".json"):
+        model_prefix = model_prefix[:-5]
 
     parameter_file_name = model_prefix + ".pdparams"
 
@@ -677,7 +683,7 @@ def load_pir(program, model_path, executor=None, var_list=None):
 
 
 @static_only
-def save_pir_inference_model(
+def save_inference_model_pir(
     path_prefix, feed_vars, fetch_vars, executor, **kwargs
 ):
     """
@@ -752,7 +758,7 @@ def save_pir_inference_model(
 
 
 @static_only
-def load_pir_inference_model(path_prefix, executor, **kwargs):
+def load_inference_model_pir(path_prefix, executor, **kwargs):
     """
 
     Load inference model from a given path. By this API, you can get the model
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 4de5e392a8493..553ea2cc5bbee 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -142,6 +142,7 @@
     atleast_1d,
     atleast_2d,
     atleast_3d,
+    block_diag,
     broadcast_tensors,
     broadcast_to,
     cast,
@@ -306,6 +307,7 @@
     inner,
     inverse,
     isfinite,
+    isin,
     isinf,
     isnan,
     isneginf,
@@ -544,6 +546,7 @@
     'hypot_',
     'nansum',
     'nanmean',
+    'block_diag',
     'count_nonzero',
     'tanh',
     'tanh_',
@@ -587,6 +590,7 @@
     'kron',
     'kthvalue',
     'isfinite',
+    'isin',
     'isinf',
     'isnan',
     'isneginf',
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 24c60af7499e6..6ef53a757ee23 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define functions to get create a tensor
+from __future__ import annotations
 
 import math
 import re
@@ -21,6 +21,12 @@
 
 import paddle
 from paddle import _C_ops
+from paddle._typing import (
+    DTypeLike,
+    NestedNumbericSequence,
+    PlaceLike,
+    TensorLike,
+)
 from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only
 
 from ..base.data_feeder import (
@@ -719,7 +725,12 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None):
     return output
 
 
-def to_tensor(data, dtype=None, place=None, stop_gradient=True):
+def to_tensor(
+    data: TensorLike | NestedNumbericSequence,
+    dtype: DTypeLike | None = None,
+    place: PlaceLike | None = None,
+    stop_gradient: bool = True,
+) -> paddle.Tensor:
     r"""
     Constructs a ``paddle.Tensor`` from ``data`` ,
     which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor.
@@ -1644,7 +1655,7 @@ def meshgrid(*args, **kwargs):
 
     Args:
         *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),
-            (N2,),..., (Nk,). Support data types: ``float64``, ``float16``, ``float32``, ``int32``, ``int64``.
+            (N2,),..., (Nk,). Support data types: ``float64``, ``bfloat16``, ``float16``, ``float32``, ``int32``, ``int64``, ``complex64``, ``complex128``.
         **kwargs (optional): Currently, only accept name in **kwargs
             The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
@@ -1686,7 +1697,16 @@ def meshgrid(*args, **kwargs):
             check_dtype(
                 input_.dtype,
                 'create data type',
-                ['uint16', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                [
+                    'uint16',
+                    'float16',
+                    'float32',
+                    'float64',
+                    'int32',
+                    'int64',
+                    'complex64',
+                    'complex128',
+                ],
                 'meshgrid',
             )
 
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 9803d4a8c5c0a..c99f46677d679 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3971,7 +3971,7 @@ def tile(x, repeat_times, name=None):
     Both the number of dimensions of ``x`` and the number of elements in ``repeat_times`` should be less than or equal to 6.
 
     Args:
-        x (Tensor): The input tensor, its data type should be bool, float16, float32, float64, int32 or int64.
+        x (Tensor): The input tensor, its data type should be bool, float16, float32, float64, int32, int64, complex64 or complex128.
         repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
             should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
@@ -4038,6 +4038,8 @@ def check_input(x, repeat_times):
                 'float64',
                 'int32',
                 'int64',
+                'complex64',
+                'complex128',
             ],
             'tile',
         )
@@ -4209,7 +4211,7 @@ def expand(x, shape, name=None):
     Both the number of dimensions of ``x`` and the number of elements in ``shape`` should be less than or equal to 6. And the number of dimensions of ``x`` should be less than the number of elements in ``shape``. The dimension to expand must have a value 0.
 
     Args:
-        x (Tensor): The input Tensor, its data type is bool, float16, float32, float64, int32, int64, uint8 or uint16.
+        x (Tensor): The input Tensor, its data type is bool, float16, float32, float64, int32, int64, uint8, uint16, complex64 or complex128.
         shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements
             should be integers or 0-D or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32.
             The value -1 in shape means keeping the corresponding dimension unchanged.
@@ -4275,6 +4277,8 @@ def expand(x, shape, name=None):
                 'int64',
                 'uint8',
                 'uint16',
+                'complex64',
+                'complex128',
             ],
             'expand',
         )
@@ -6861,3 +6865,67 @@ def slice_scatter(x, value, axes, starts, ends, strides, name=None):
         )
 
         return output
+
+
+def block_diag(inputs, name=None):
+    """
+    Create a block diagonal matrix from provided tensors.
+
+    Args:
+        inputs (list|tuple): ``inputs`` is a Tensor list or Tensor tuple, one or more tensors with 0, 1, or 2 dimensions. The data type: ``bool``, ``float16``, ``float32``, ``float64``, ``uint8``, ``int8``, ``int16``, ``int32``, ``int64``, ``bfloat16``, ``complex64``, ``complex128``.
+        name (str, optional): Name for the operation (optional, default is None).
+
+    Returns:
+        Tensor, A ``Tensor``. The data type is same as ``inputs``.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+
+            >>> A = paddle.to_tensor([[4], [3], [2]])
+            >>> B = paddle.to_tensor([7, 6, 5])
+            >>> C = paddle.to_tensor(1)
+            >>> D = paddle.to_tensor([[5, 4, 3], [2, 1, 0]])
+            >>> E = paddle.to_tensor([[8, 7], [7, 8]])
+            >>> out = paddle.block_diag([A, B, C, D, E])
+            >>> print(out)
+            Tensor(shape=[9, 10], dtype=int64, place=Place(gpu:0), stop_gradient=True,
+                [[4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [0, 7, 6, 5, 0, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 5, 4, 3, 0, 0],
+                [0, 0, 0, 0, 0, 2, 1, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 0, 0, 8, 7],
+                [0, 0, 0, 0, 0, 0, 0, 0, 7, 8]])
+    """
+
+    def to_col_block(arys, i, a):
+        return [
+            a
+            if idx == i
+            else paddle.zeros([ary.shape[0], a.shape[1]], dtype=a.dtype)
+            for idx, ary in enumerate(arys)
+        ]
+
+    def to_2d(ary):
+        if ary.ndim == 0:
+            return ary.unsqueeze(axis=0).unsqueeze(axis=0)
+        if ary.ndim == 1:
+            return ary.unsqueeze(axis=0)
+        if ary.ndim == 2:
+            return ary
+        raise ValueError(
+            "For 'block_diag', the dimension of each elements in 'inputs' must be 0, 1, or 2, but got "
+            f"{ary.ndim}"
+        )
+
+    arys = [to_2d(ary) for ary in inputs]
+
+    matrix = [
+        paddle.concat(to_col_block(arys, idx, ary), axis=0)
+        for idx, ary in enumerate(arys)
+    ]
+    return paddle.concat(matrix, axis=1)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index d7d8669ff0c3b..3df4cf88c94b6 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -2726,7 +2726,7 @@ def inverse(x, name=None):
         x (Tensor): The input tensor. The last two
             dimensions should be equal. When the number of dimensions is
             greater than 2, it is treated as batches of square matrix. The data
-            type can be float32 and float64.
+            type can be float32, float64, complex64, complex128.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -2751,7 +2751,12 @@ def inverse(x, name=None):
     else:
 
         def _check_input(x):
-            check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'inverse')
+            check_variable_and_dtype(
+                x,
+                'x',
+                ['float32', 'float64', 'complex64', 'complex128'],
+                'inverse',
+            )
             if len(x.shape) < 2:
                 raise ValueError(
                     "The input of inverse is expected to be a Tensor whose number "
@@ -7969,3 +7974,187 @@ def sinc_(x, name=None):
     paddle.sin_(x)
     paddle.divide_(x, tmp)
     return paddle.where(~paddle.isnan(x), x, paddle.full_like(x, 1.0))
+
+
+def isin(x, test_x, assume_unique=False, invert=False, name=None):
+    r"""
+    Tests if each element of `x` is in `test_x`.
+
+    Args:
+        x (Tensor): The input Tensor. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
+        test_x (Tensor): Tensor values against which to test for each input element. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
+        assume_unique (bool, optional): If True, indicates both `x` and `test_x` contain unique elements, which could make the calculation faster. Default: False.
+        invert (bool, optional): Indicate whether to invert the boolean return tensor. If True, invert the results. Default: False.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor), The output Tensor with the same shape as `x`.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> paddle.set_device('cpu')
+            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
+            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
+            >>> res = paddle.isin(x, test_x)
+            >>> print(res)
+            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, True, True, False, True])
+
+            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
+            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
+            >>> res = paddle.isin(x, test_x, invert=True)
+            >>> print(res)
+            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True, False, False, True, False])
+
+            >>> # Set `assume_unique` to True only when `x` and `test_x` contain unique values, otherwise the result may be incorrect.
+            >>> x = paddle.to_tensor([0., 1., 2.]*20).reshape([20, 3])
+            >>> test_x = paddle.to_tensor([0., 1.]*20)
+            >>> correct_result = paddle.isin(x, test_x, assume_unique=False)
+            >>> print(correct_result)
+            Tensor(shape=[20, 3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False]])
+
+            >>> incorrect_result = paddle.isin(x, test_x, assume_unique=True)
+            >>> print(incorrect_result)
+            Tensor(shape=[20, 3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , False]])
+
+    """
+    if not isinstance(x, (paddle.Tensor, Variable, paddle.pir.Value)):
+        raise TypeError(f"x must be tensor type, but got {type(x)}")
+    if not isinstance(test_x, (paddle.Tensor, Variable, paddle.pir.Value)):
+        raise TypeError(f"test_x must be tensor type, but got {type(test_x)}")
+
+    check_variable_and_dtype(
+        x,
+        "x",
+        [
+            'uint16',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+        ],
+        "isin",
+    )
+
+    check_variable_and_dtype(
+        test_x,
+        "test_x",
+        [
+            'uint16',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+        ],
+        "isin",
+    )
+
+    x_zero_dim = False
+    if len(x.shape) == 0:
+        x = x.reshape([1])
+        x_zero_dim = True
+
+    size_x = math.prod(x.shape)
+    size_t = math.prod(test_x.shape)
+    if size_t < math.pow(size_x, 0.145) * 10.0:
+        # use brute-force searching if the test_x size is small
+        # NOTE: a 0-D `x` has already been reshaped to [1] above, so `x` is at
+        # least 1-D here; no special case for a 0-D input is needed.
+
+        tmp = x.reshape(tuple(x.shape) + ((1,) * test_x.ndim))
+        cmp = tmp == test_x
+        dim = tuple(range(-1, -test_x.ndim - 1, -1))
+        cmp = cmp.any(axis=dim)
+        if invert:
+            cmp = ~cmp
+    else:
+        x_flat = x.flatten()
+        test_x_flat = test_x.flatten()
+        if assume_unique:
+            # if x and test_x both contain unique elements, use stable argsort method which could be faster
+            all_elements = paddle.concat([x_flat, test_x_flat])
+            sorted_index = paddle.argsort(all_elements, stable=True)
+            sorted_x = all_elements[sorted_index]
+
+            duplicate_mask = paddle.full_like(sorted_index, False, dtype='bool')
+            if not in_dynamic_mode():
+                duplicate_mask = paddle.static.setitem(
+                    duplicate_mask,
+                    paddle.arange(duplicate_mask.numel() - 1),
+                    sorted_x[1:] == sorted_x[:-1],
+                )
+            else:
+                duplicate_mask[:-1] = sorted_x[1:] == sorted_x[:-1]
+
+            if invert:
+                duplicate_mask = duplicate_mask.logical_not()
+
+            mask = paddle.empty_like(duplicate_mask)
+            if not in_dynamic_or_pir_mode():
+                mask = paddle.static.setitem(mask, sorted_index, duplicate_mask)
+            else:
+                mask[sorted_index] = duplicate_mask
+
+            cmp = mask[0 : x.numel()].reshape(x.shape)
+        else:
+            # otherwise use searchsorted method
+            sorted_test_x = paddle.sort(test_x_flat)
+            idx = paddle.searchsorted(sorted_test_x, x_flat)
+            test_idx = paddle.where(
+                idx < sorted_test_x.numel(),
+                idx,
+                paddle.zeros_like(idx, 'int64'),
+            )
+            cmp = sorted_test_x[test_idx] == x_flat
+            cmp = cmp.logical_not() if invert else cmp
+            cmp = cmp.reshape(x.shape)
+
+    if x_zero_dim:
+        return cmp.reshape([])
+    else:
+        return cmp
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 736ae891f2fb8..9ec4cd1e2ec7f 100755
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -130,6 +130,7 @@ def argsort(x, axis=-1, descending=False, stable=False, name=None):
             x,
             'x',
             [
+                'uint16',
                 'float16',
                 'float32',
                 'float64',
diff --git a/python/paddle/tensor/tensor.prototype.pyi b/python/paddle/tensor/tensor.prototype.pyi
index 735c8da282545..ffc870d34cb7a 100644
--- a/python/paddle/tensor/tensor.prototype.pyi
+++ b/python/paddle/tensor/tensor.prototype.pyi
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# The `Tensor` template for `tools/gen_tensor_stub.py` generates the stub file `tensor.pyi`.
-# Add docstring, attributes, methods and alias with type annotaions for `Tensor`
+# `tensor.prototype.pyi` is the `Tensor` template from which `tools/gen_tensor_stub.py` generates the stub file `tensor.pyi`.
+# Add docstring, attributes, methods and alias with type annotations for `Tensor` in `tensor.prototype.pyi`
 # if not conveniently coding in original place (like c++ source file).
 
-from typing import Any, overload
+from typing import Any, Literal, overload
 
 import numpy.typing as npt
 from typing_extensions import TypeAlias
@@ -180,7 +180,7 @@ class Tensor:
             | tuple[None | bool | int | _Slice, ...]
             | list[Tensor | bool | int]
         ),
-        value: Tensor | npt.NDArray[Any] | int | float | complex | bool,
+        value: Tensor | npt.NDArray[Any] | complex | bool,
     ) -> None: ...
     def __len__(self) -> int: ...
 
@@ -260,4 +260,4 @@ class Tensor:
     def type(self) -> Any: ...
 
     # annotation: ${tensor_alias}
-    __qualname__ = "Tensor"
+    __qualname__: Literal["Tensor"]
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index 5118460f2ad66..45fb1a89e0903 100755
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -34,8 +34,6 @@ class VisibleDeprecationWarning(UserWarning):
     See more details from https://peps.python.org/pep-0565/
     """
 
-    ...
-
 
 def deprecated(update_to="", since="", reason="", level=0):
     """Decorate a function to signify its deprecation.
diff --git a/python/paddle/utils/environments.py b/python/paddle/utils/environments.py
index 7054dd1cc43a9..256fa9669be2d 100644
--- a/python/paddle/utils/environments.py
+++ b/python/paddle/utils/environments.py
@@ -17,6 +17,8 @@
 import os
 from typing import Generic, TypeVar
 
+from typing_extensions import Self
+
 T = TypeVar("T")
 
 
@@ -100,7 +102,7 @@ def __init__(self, variable: EnvironmentVariable[T], value: T):
         self.original_value = variable.get()
         self.variable.set(value)
 
-    def __enter__(self) -> EnvironmentVariableGuard:
+    def __enter__(self) -> Self:
         return self
 
     def __exit__(self, exc_type, exc_value, traceback) -> None:
diff --git a/python/paddle/utils/layers_utils.py b/python/paddle/utils/layers_utils.py
index 99a3f122b56d7..42587601f09cb 100644
--- a/python/paddle/utils/layers_utils.py
+++ b/python/paddle/utils/layers_utils.py
@@ -13,12 +13,15 @@
 # limitations under the License.
 
 import copy
+import typing
 from collections import defaultdict
 from collections.abc import Sequence
+from typing import Any, Dict, TypeVar, Union
 from uuid import uuid4
 from weakref import WeakKeyDictionary
 
 import numpy as np
+from typing_extensions import TypeGuard
 
 import paddle
 from paddle.pir.core import convert_np_dtype_to_dtype_
@@ -31,6 +34,12 @@
 )
 from ..pir import Value
 
+_T = TypeVar("_T")
+
+Structure = Union[
+    _T, Dict[str, "Structure[_T]"], typing.Sequence["Structure[_T]"]
+]
+
 
 def convert_to_list(value, n, name, dtype=int):
     """
@@ -102,7 +111,7 @@ def convert_to_list(value, n, name, dtype=int):
         return value_list
 
 
-def is_sequence(seq):
+def is_sequence(seq: Any) -> TypeGuard[typing.Sequence[Any]]:
     """
     Whether `seq` is an entry or nested structure
     """
@@ -164,7 +173,7 @@ def to_sequence(nest):
         return [nest]
 
 
-def flatten(nest):
+def flatten(nest: Structure[_T]) -> typing.Sequence[_T]:
     """
         :alias_main: paddle.flatten
         :alias: paddle.flatten,paddle.tensor.flatten,paddle.tensor.manipulation.flatten
diff --git a/python/requirements.txt b/python/requirements.txt
index ada631fed6814..3b94629a108ae 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,5 +1,5 @@
 httpx
-numpy>=1.13, <2.0
+numpy>=1.13
 protobuf>=3.20.2 
 Pillow
 decorator
diff --git a/python/setup.py.in b/python/setup.py.in
index 67d23a089aa37..ae5a7c0bdc5f6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -710,6 +710,12 @@ package_data['paddle.libs']= []
 if('${WITH_SHARED_PHI}' == 'ON'):
     package_data['paddle.libs'] += [('libphi' if os.name != 'nt' else 'phi') + ext_name]
     shutil.copy('${PHI_LIB}', libs_path)
+    if('${PHI_KERNEL_GPU_LIB}'):
+        package_data['paddle.libs'] += [
+            ('libphi_kernel_gpu' if os.name != 'nt' else 'phi_kernel_gpu')
+            + ext_name
+        ]
+        shutil.copy('${PHI_KERNEL_GPU_LIB}', libs_path)
 
 if('${WITH_SHARED_IR}' == 'ON'):
     package_data['paddle.libs'] += [('libpir' if os.name != 'nt' else 'pir') + ext_name]
@@ -1054,6 +1060,36 @@ if '${WITH_STRIP}' == 'ON':
     if os.system(command) != 0:
         raise Exception("strip *.so failed, command: %s" % command)
 
+
+def check_build_dependency():
+    # Fail early when a package from python/requirements.txt is not installed.
+    missing_modules = '''Missing build dependency: {dependency}
+Please run 'pip install -r python/requirements.txt' to make sure you have all the dependencies installed.
+'''.strip()
+
+    with open('${PADDLE_SOURCE_DIR}' + '/python/requirements.txt') as f:
+        build_dependencies = f.read().splitlines()
+
+    python_dependencies_module = []
+    for dependency in build_dependencies:
+        # Keep only the distribution name: cut at the first comment, environment
+        # marker, extras bracket or version specifier (==, >=, <=, <, >, ~=, !=).
+        name = re.split(r"[\s#;\[=<>!~]", dependency.strip(), maxsplit=1)[0]
+        # Skip blank/comment-only lines and pip options such as `-r file`.
+        if name and not name.startswith('-'):
+            python_dependencies_module.append(re.sub("_|-", '', name))
+
+    reqs = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze'])
+    installed_packages = {
+        re.sub("_|-", '', r.decode().split('==')[0]).lower()
+        for r in reqs.split()
+    }
+
+    for dependency in python_dependencies_module:
+        if dependency.lower() not in installed_packages:
+            raise RuntimeError(missing_modules.format(dependency=dependency))
+
+
 def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir):
     """install cpp distribution and build test target
 
@@ -1095,6 +1131,9 @@ def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir):
     subprocess.check_call(["cmake", "--build", paddle_lib_test_dir])
 
 
+# check build dependency
+check_build_dependency()
+
 # install cpp distribution
 if '${WITH_CPP_DIST}' == 'ON':
     paddle_install_dir = '${PADDLE_INSTALL_DIR}'
@@ -1112,6 +1151,28 @@ package_data['paddle.base'] = package_data.get('paddle.base', []) + [
 package_data['paddle.tensor'] = package_data.get('paddle.tensor', []) + ['tensor.pyi']
 
 
+def generate_tensor_stub(paddle_binary_dir, paddle_source_dir):
+    """Generate `tensor.pyi` in the build dir and copy it to the source dir."""
+    print('-' * 2, 'Generate stub file tensor.pyi ... ')
+    stub_rel_path = '/python/paddle/tensor/tensor.pyi'
+    built_stub = paddle_binary_dir + stub_rel_path
+
+    # Put <source>/tools/ on sys.path so that `gen_tensor_stub` can be imported.
+    sys.path.append(paddle_source_dir + '/tools/')
+    import gen_tensor_stub
+
+    gen_tensor_stub.generate_stub_file(
+        input_file=paddle_source_dir
+        + '/python/paddle/tensor/tensor.prototype.pyi',
+        output_file=built_stub,
+    )
+    shutil.copy(built_stub, paddle_source_dir + stub_rel_path)
+    print('-' * 2, 'End Generate stub file tensor.pyi ... ')
+
+# generate stub file `tensor.pyi`
+generate_tensor_stub('${PADDLE_BINARY_DIR}', '${PADDLE_SOURCE_DIR}')
+
+
 with redirect_stdout():
     setup(name='${PACKAGE_NAME}',
         version='${PADDLE_VERSION}',
diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in
index 597e9b9187f6c..aa68da69a9f7c 100644
--- a/python/setup_cinn.py.in
+++ b/python/setup_cinn.py.in
@@ -220,10 +220,10 @@ if platform.system() == 'Linux' and platform.machine() == 'x86_64':
             cuda_major_version = version.split('.')[0]
         except Exception as e:
             raise ValueError("CUDA not found")
-        
+
         install_requires.append(PADDLE_CUDA_INSTALL_REQUIREMENTS[cuda_major_version].split("|"))
-        
-        
+
+
 
 with redirect_stdout():
     setup(
diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
index 15cf679177709..113283aff3500 100644
--- a/python/unittest_py/requirements.txt
+++ b/python/unittest_py/requirements.txt
@@ -2,7 +2,7 @@ PyGithub
 coverage==5.5
 pycrypto ; platform_system != "Windows"
 mock
-gym==0.26.2
+gymnasium==0.29.1
 pygame==2.5.2
 hypothesis
 opencv-python<=4.2.0.32
@@ -19,3 +19,4 @@ wandb>=0.13 ; python_version<"3.12"
 xlsxwriter==3.0.9
 xdoctest==1.1.1
 ubelt==1.3.3 # just for xdoctest
+mypy==1.10.0
diff --git a/r/Dockerfile b/r/Dockerfile
index 2605e98f7684d..f2fa52082ba96 100644
--- a/r/Dockerfile
+++ b/r/Dockerfile
@@ -30,7 +30,7 @@ RUN echo "channels:" >> ~/.condarc && \
     echo "  simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud" >> ~/.condarc && \
     echo "show_channel_urls: true" >> ~/.condarc && \
     echo "channel_priority: strict" >> ~/.condarc
-    
+
 # Install R
 RUN conda install -y r -c conda-forge
 
@@ -44,4 +44,4 @@ RUN Rscript -e 'install.packages("reticulate", repos="https://cran.rstudio.com")
 COPY example example
 RUN cd example && \
     curl -O https://paddle-inference-dist.cdn.bcebos.com/mobilenet-test-model-data.tar.gz && \
-    tar -zxvf mobilenet-test-model-data.tar.gz && rm mobilenet-test-model-data.tar.gz 
+    tar -zxvf mobilenet-test-model-data.tar.gz && rm mobilenet-test-model-data.tar.gz
diff --git a/setup.py b/setup.py
index aab6fe0bcfd82..d2f93f15249c5 100644
--- a/setup.py
+++ b/setup.py
@@ -1061,6 +1061,12 @@ def get_package_data_and_package_dir():
             ('libphi' if os.name != 'nt' else 'phi') + ext_suffix
         ]
         shutil.copy(env_dict.get("PHI_LIB"), libs_path)
+        if env_dict.get("PHI_KERNEL_GPU_LIB"):
+            package_data['paddle.libs'] += [
+                ('libphi_kernel_gpu' if os.name != 'nt' else 'phi_kernel_gpu')
+                + ext_suffix
+            ]
+            shutil.copy(env_dict.get("PHI_KERNEL_GPU_LIB"), libs_path)
 
     if env_dict.get("WITH_SHARED_IR") == "ON":
         package_data['paddle.libs'] += [
@@ -1796,6 +1802,25 @@ def submodules_not_exists_or_empty(folder):
             sys.exit(1)
 
 
+def generate_tensor_stub(paddle_binary_dir, paddle_source_dir):
+    """Generate `tensor.pyi` in the build dir and copy it to the source dir."""
+    print('-' * 2, 'Generate stub file tensor.pyi ... ')
+    stub_rel_path = '/python/paddle/tensor/tensor.pyi'
+    built_stub = paddle_binary_dir + stub_rel_path
+
+    # Put <source>/tools/ on sys.path so that `gen_tensor_stub` can be imported.
+    sys.path.append(paddle_source_dir + '/tools/')
+    import gen_tensor_stub
+
+    gen_tensor_stub.generate_stub_file(
+        input_file=paddle_source_dir
+        + '/python/paddle/tensor/tensor.prototype.pyi',
+        output_file=built_stub,
+    )
+    shutil.copy(built_stub, paddle_source_dir + stub_rel_path)
+    print('-' * 2, 'End Generate stub file tensor.pyi ... ')
+
+
 def main():
     # Parse the command line and check arguments before we proceed with building steps and setup
     parse_input_command(filter_args_list)
@@ -1875,6 +1900,9 @@ def main():
             package_data['paddle.libs'],
         )
 
+    # generate stub file `tensor.pyi`
+    generate_tensor_stub(paddle_binary_dir, paddle_source_dir)
+
     setup(
         name=package_name,
         version=paddle_version,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f732dad9e7f54..9fd22a6cf8b46 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -227,6 +227,10 @@ if(${len} GREATER_EQUAL 1)
       target_link_libraries(${test_name} $<TARGET_LINKER_FILE:${paddle_lib}>)
       if(WITH_SHARED_PHI)
         target_link_libraries(${test_name} $<TARGET_LINKER_FILE:phi>)
+        if(WITH_GPU OR WITH_ROCM)
+          target_link_libraries(${test_name}
+                                $<TARGET_LINKER_FILE:phi_kernel_gpu>)
+        endif()
       endif()
       if(WITH_SHARED_IR)
         target_link_libraries(${test_name} $<TARGET_LINKER_FILE:pir>)
diff --git a/test/deprecated/amp/test_collect_operator_stats.py b/test/amp/test_collect_operator_stats.py
similarity index 94%
rename from test/deprecated/amp/test_collect_operator_stats.py
rename to test/amp/test_collect_operator_stats.py
index 8b1d4f021a96d..80e592414e016 100644
--- a/test/deprecated/amp/test_collect_operator_stats.py
+++ b/test/amp/test_collect_operator_stats.py
@@ -157,11 +157,14 @@ class TestOpStatsStatic(unittest.TestCase):
     def test_while_op(self):
         paddle.enable_static()
         main_program, startup_program = build_while_model()
-        self.assertEqual(main_program.num_blocks, 2)
-
-        paddle.static.amp.debugging.collect_operator_stats(
-            program=main_program, print_subblocks=True
-        )
+        if paddle.framework.use_pir_api():
+            self.assertEqual(main_program.num_blocks, 1)
+        else:
+            self.assertEqual(main_program.num_blocks, 2)
+
+            paddle.static.amp.debugging.collect_operator_stats(
+                program=main_program, print_subblocks=True
+            )
         paddle.disable_static()
 
 
diff --git a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py
index 595f58b206193..8cf3f185dcbfc 100644
--- a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py
+++ b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import unittest
 
 import paddle
 import paddle.distributed as dist
@@ -183,14 +182,11 @@ def run_pr_to_rs_case(self):
         tgt_out_value = (self._mesh.process_ids, [-1, 1, -1], {})
 
     def run_pr_to_ss_case(self):
-        # [Partial(), Replicate()] --> [Shard(0), Shard(1)]
-        # raise NotImplementedError
-        with unittest.TestCase().assertRaises(NotImplementedError):
-            self.create_program(
-                [self.BATCH_SIZE, self.SEQ_LEN, self.HIDDEN_SIZE],
-                [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()],
-                [dist.Shard(0), dist.Shard(1)],
-            )
+        self.create_program(
+            [self.BATCH_SIZE, self.SEQ_LEN, self.HIDDEN_SIZE],
+            [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()],
+            [dist.Shard(0), dist.Shard(1)],
+        )
 
     def run_ss_to_ss_case(self):
         # [Shard(0), Shard(1)] --> [Shard(1), Shard(0)]
diff --git a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py
index 47bfb9a44df06..532426208c1ee 100644
--- a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py
+++ b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py
@@ -102,7 +102,7 @@ def run_pp_to_rr_case(self):
 
         rank_id = dist.get_rank()
         if rank_id in self._mesh0.process_ids:
-            assert new_ops_name[-1] == "pd_op.send_v2"
+            assert new_ops_name[2] == "pd_op.send_v2"
         else:
             assert new_ops_name[2] == "pd_op.recv_v2"
             assert new_ops_name[-2] == "pd_op.c_allreduce_sum_"
diff --git a/test/auto_parallel/pir/mlp_demo_3d.py b/test/auto_parallel/pir/mlp_demo_3d.py
index 41ac0d25f682a..a743aa218e659 100644
--- a/test/auto_parallel/pir/mlp_demo_3d.py
+++ b/test/auto_parallel/pir/mlp_demo_3d.py
@@ -118,50 +118,43 @@ def test_to_static_program(self):
         rank = paddle.distributed.get_rank()
         ops = dist_program.global_block().ops
         op_names = [op.name() for op in ops]
-        if rank < 4:
-            std_ops = [
-                'pd_op.data',
-                'builtin.parameter',
-                'pd_op.data',
-                'pd_op.relu',
-                'pd_op.matmul',
-                'pd_op.relu',
-                'dist_op.reshard',
-                'dist_op.reshard',
-                'pd_op.relu_grad',
-                'pd_op.matmul_grad',
-                'dist_op.reshard',
-                'dist_op.reshard',
-                'pd_op.relu_grad',
-                'pd_op.sgd_',
-            ]
-        else:
-            std_ops = [
-                'pd_op.data',
-                'builtin.parameter',
-                'pd_op.data',
-                'dist_op.reshard',
-                'pd_op.matmul',
-                'dist_op.reshard',
-                'pd_op.relu',
-                'pd_op.subtract',
-                'pd_op.square',
-                'pd_op.mean',
-                'builtin.shadow_output',
-                'pd_op.full',
-                'pd_op.full_like',
-                'dist_op.reshard',
-                'pd_op.mean_grad',
-                'dist_op.reshard',
-                'pd_op.square_grad',
-                'pd_op.subtract_grad',
-                'pd_op.relu_grad',
-                'pd_op.matmul_grad',
-                'dist_op.reshard',
-                'dist_op.reshard',
-                'pd_op.sgd_',
-            ]
-
+        std_ops = [
+            'pd_op.data',
+            'pd_op.data',
+            'builtin.parameter',
+            'builtin.parameter',
+            'pd_op.data',
+            'pd_op.data',
+            'pd_op.relu',
+            'pd_op.matmul',
+            'pd_op.relu',
+            'dist_op.reshard',
+            'pd_op.matmul',
+            'dist_op.reshard',
+            'pd_op.relu',
+            'pd_op.subtract',
+            'pd_op.square',
+            'pd_op.mean',
+            'builtin.shadow_output',
+            'pd_op.full',
+            'pd_op.full_like',
+            'dist_op.reshard',
+            'pd_op.mean_grad',
+            'dist_op.reshard',
+            'pd_op.square_grad',
+            'pd_op.subtract_grad',
+            'pd_op.relu_grad',
+            'pd_op.matmul_grad',
+            'dist_op.reshard',
+            'dist_op.reshard',
+            'pd_op.relu_grad',
+            'pd_op.matmul_grad',
+            'dist_op.reshard',
+            'dist_op.reshard',
+            'pd_op.relu_grad',
+            'pd_op.sgd_',
+            'pd_op.sgd_',
+        ]
         assert op_names == std_ops
 
     def test_loss_value(self):
diff --git a/test/auto_parallel/pir/pir_reshard_s_to_r.py b/test/auto_parallel/pir/pir_reshard_s_to_r.py
index 933eb855730ea..1d4afcddf0d64 100644
--- a/test/auto_parallel/pir/pir_reshard_s_to_r.py
+++ b/test/auto_parallel/pir/pir_reshard_s_to_r.py
@@ -81,7 +81,7 @@ def run_pir_test_case(self):
                 std_ops,
             )
         elif self._shard == 1:
-            np.testing.assert_equal(main_program.num_ops(), 10)
+            np.testing.assert_equal(main_program.num_ops(), 8)
             std_ops = [
                 'builtin.parameter',
                 'pd_op.data',
@@ -89,9 +89,7 @@ def run_pir_test_case(self):
                 'pd_op.c_allgather',
                 'pd_op.full',
                 'pd_op.split_with_num',
-                'builtin.split',
                 'pd_op.full',
-                'builtin.combine',
                 'pd_op.concat',
             ]
 
diff --git a/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py b/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py
index 771fbf29491ba..6b2fab19e2dab 100644
--- a/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py
+++ b/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py
@@ -65,12 +65,14 @@ def run_pir_test_case(self):
         ops = [op.name() for op in main_program.global_block().ops]
         if self._shard == 0:
             if paddle.distributed.get_rank() == 0:
-                np.testing.assert_equal(main_program.num_ops(), 4)
+                np.testing.assert_equal(main_program.num_ops(), 6)
                 std_ops = [
                     'builtin.parameter',
                     'pd_op.data',
                     'dist_op.shard_tensor',
                     'pd_op.send_v2',
+                    'dist_op.reshard',
+                    'pd_op.c_allgather',
                 ]
                 np.testing.assert_equal(
                     ops,
@@ -91,19 +93,25 @@ def run_pir_test_case(self):
                 )
         elif self._shard == 1:
             if paddle.distributed.get_rank() == 0:
-                np.testing.assert_equal(main_program.num_ops(), 4)
+                np.testing.assert_equal(main_program.num_ops(), 10)
                 std_ops = [
                     'builtin.parameter',
                     'pd_op.data',
                     'dist_op.shard_tensor',
                     'pd_op.send_v2',
+                    'dist_op.reshard',
+                    'pd_op.c_allgather',
+                    'pd_op.full',
+                    'pd_op.split_with_num',
+                    'pd_op.full',
+                    'pd_op.concat',
                 ]
                 np.testing.assert_equal(
                     ops,
                     std_ops,
                 )
             elif paddle.distributed.get_rank() == 1:
-                np.testing.assert_equal(main_program.num_ops(), 11)
+                np.testing.assert_equal(main_program.num_ops(), 9)
                 std_ops = [
                     'builtin.parameter',
                     'pd_op.data',
@@ -112,9 +120,7 @@ def run_pir_test_case(self):
                     'pd_op.c_allgather',
                     'pd_op.full',
                     'pd_op.split_with_num',
-                    'builtin.split',
                     'pd_op.full',
-                    'builtin.combine',
                     'pd_op.concat',
                 ]
 
diff --git a/test/auto_parallel/reshard_p_to_r_cross_mesh.py b/test/auto_parallel/reshard_p_to_r_cross_mesh.py
index 6960530bf3bb3..605a245cd19db 100644
--- a/test/auto_parallel/reshard_p_to_r_cross_mesh.py
+++ b/test/auto_parallel/reshard_p_to_r_cross_mesh.py
@@ -90,12 +90,14 @@ def run_pir_static_test_case(self):
 
         ops = [op.name() for op in main_program.global_block().ops]
         if paddle.distributed.get_rank() == 0:
-            np.testing.assert_equal(main_program.num_ops(), 4)
+            np.testing.assert_equal(main_program.num_ops(), 6)
             std_ops = [
                 'builtin.parameter',
                 'pd_op.data',
                 'dist_op.shard_tensor',
                 'pd_op.send_v2',
+                'dist_op.reshard',
+                'pd_op.c_allreduce_sum_',
             ]
         else:
             np.testing.assert_equal(main_program.num_ops(), 5)
diff --git a/test/auto_parallel/spmd_rules/test_flatten_rule.py b/test/auto_parallel/spmd_rules/test_flatten_rule.py
index 599b2ddf4bf95..9a9ae6b921842 100644
--- a/test/auto_parallel/spmd_rules/test_flatten_rule.py
+++ b/test/auto_parallel/spmd_rules/test_flatten_rule.py
@@ -38,7 +38,7 @@ def setUp(self):
 
     def test_flatten_infer_forward(self):
         # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24]
-        # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1] [ 0, -1, 1]
+        # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1], ([0, -1, 1], [-1, 0, -1, -1, 1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1])
         self.attrs['start_axis'] = 1
         self.attrs['stop_axis'] = 2
@@ -51,14 +51,17 @@ def test_flatten_infer_forward(self):
         infered_output_dist_attrs = result_dist_attrs[1]
 
         self.assertEqual(len(infered_input_dist_attrs), 1)
-        self.assertEqual(len(infered_output_dist_attrs), 1)
+        self.assertEqual(len(infered_output_dist_attrs), 2)
         self.assertEqual(
             infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, 1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, 1])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, 0, -1, -1, 1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24]
-        # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] [ -1, 0, 1]
+        # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] ([ -1, 0, 1], [-1, -1, 0, -1, 1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1])
         self.attrs['start_axis'] = 1
         self.attrs['stop_axis'] = 2
@@ -74,9 +77,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, 1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0, 1])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, -1, 0, -1, 1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24]
-        # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] [ -1, -1, 0]
+        # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] ([ -1, -1, 0], [-1, -1, -1, -1, 0] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 1, 0])
         self.attrs['start_axis'] = 1
         self.attrs['stop_axis'] = 2
@@ -92,9 +98,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, 0]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1, 0])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, 0]
+        )
 
         # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24]
-        # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] [ -1]
+        # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] ([ -1], [-1, -1, -1, -1, -1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1])
         self.attrs['start_axis'] = 0
         self.attrs['stop_axis'] = -1
@@ -110,9 +119,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, -1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24]
-        # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, -1] [ 0]
+        # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, -1] ([ 0], [-1, 0, -1, -1, -1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1])
         self.attrs['start_axis'] = 0
         self.attrs['stop_axis'] = -1
@@ -128,9 +140,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, 0, -1, -1, -1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24]
-        # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] [ 1]
+        # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] ([ 1], [-1, 1, -1, -1, -1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([1, 0, -1, -1])
         self.attrs['start_axis'] = 0
         self.attrs['stop_axis'] = -1
@@ -146,9 +161,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, -1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, 1, -1, -1, -1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24]
-        # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] [-1, -1]
+        # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] ([-1, -1], [-1, -1, -1, -1, -1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1])
         self.attrs['start_axis'] = 1
         self.attrs['stop_axis'] = -1
@@ -164,9 +182,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, -1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24]
-        # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] [-1, 0]
+        # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] ([-1, 0], [-1, -1, 0, -1, -1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1])
         self.attrs['start_axis'] = 1
         self.attrs['stop_axis'] = -1
@@ -182,9 +203,12 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, -1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, -1, 0, -1, -1]
+        )
 
         # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24]
-        # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] [0, 1]
+        # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] ([0, 1], [-1, 0, 1, -1, -1] // xshape)
         self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1])
         self.attrs['start_axis'] = 1
         self.attrs['stop_axis'] = -1
@@ -200,6 +224,9 @@ def test_flatten_infer_forward(self):
             infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1]
         )
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1])
+        self.assertEqual(
+            infered_output_dist_attrs[1].dims_mapping, [-1, 0, 1, -1, -1]
+        )
 
     def test_flatten_infer_backward(self):
         process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
diff --git a/test/autograd/CMakeLists.txt b/test/autograd/CMakeLists.txt
index 14336674c2ce0..82dc290ddbe3f 100644
--- a/test/autograd/CMakeLists.txt
+++ b/test/autograd/CMakeLists.txt
@@ -20,3 +20,4 @@ set_tests_properties(test_minimize PROPERTIES TIMEOUT 60)
 if(NOT WIN32)
   set_tests_properties(test_autograd_functional_prim PROPERTIES TIMEOUT 60)
 endif()
+set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160)
diff --git a/test/deprecated/autograd/test_autograd_functional_static.py b/test/autograd/test_autograd_functional_static.py
similarity index 99%
rename from test/deprecated/autograd/test_autograd_functional_static.py
rename to test/autograd/test_autograd_functional_static.py
index 127cb93a4cbc6..c8089ce437d0d 100644
--- a/test/deprecated/autograd/test_autograd_functional_static.py
+++ b/test/autograd/test_autograd_functional_static.py
@@ -353,6 +353,9 @@ def run_test_by_entries(self, pd_f, np_f, inps, batch=False):
             np.testing.assert_allclose(pd_entry, np_entry, self.rtol, self.atol)
 
     def test_square(self):
+        if paddle.framework.use_pir_api():
+            return
+
         def pd_f(x):
             return paddle.multiply(x, x)
 
diff --git a/test/cinn/ir/test_llir_schedule_fuse_split.py b/test/cinn/ir/test_llir_schedule_fuse_split.py
index 612e3a36c59a1..91930d82a90d6 100644
--- a/test/cinn/ir/test_llir_schedule_fuse_split.py
+++ b/test/cinn/ir/test_llir_schedule_fuse_split.py
@@ -158,7 +158,7 @@ def elementwise_fuse_assign_loop(
                             i_j_k_fused % 128,
                         ],
                     )
-                    Y[i1, j1, k1] = 2.0 * X[i1, j1, k1]
+                    Y[i1, j1, k1] = X[i1, j1, k1] * 2.0
 
     assert str(origin.elementwise_fuse_assign_loop) == str(
         expected.elementwise_fuse_assign_loop
diff --git a/test/collective/fleet/test_c_comm_init_op.sh b/test/collective/fleet/test_c_comm_init_op.sh
index 9b99e553d182b..dbf148856d435 100644
--- a/test/collective/fleet/test_c_comm_init_op.sh
+++ b/test/collective/fleet/test_c_comm_init_op.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/collective/fleet/test_fused_attention_pass_with_mp.sh b/test/collective/fleet/test_fused_attention_pass_with_mp.sh
index 4b2b48cdc08df..777b6b106ee70 100644
--- a/test/collective/fleet/test_fused_attention_pass_with_mp.sh
+++ b/test/collective/fleet/test_fused_attention_pass_with_mp.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/collective/fleet/test_new_group.sh b/test/collective/fleet/test_new_group.sh
index 4914183fb46f9..4ec46d22cdb48 100755
--- a/test/collective/fleet/test_new_group.sh
+++ b/test/collective/fleet/test_new_group.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/collective/multinode/multinode_dist_test.sh b/test/collective/multinode/multinode_dist_test.sh
index 8ea1937f8318a..002b9eee612ec 100644
--- a/test/collective/multinode/multinode_dist_test.sh
+++ b/test/collective/multinode/multinode_dist_test.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -84,7 +84,7 @@ if [[ $exit_code -ne 0 ]]; then
 fi
 
 #display system context
-for i in {1..2}; do 
+for i in {1..2}; do
     sleep 3
     ps -aux
     netstat -anlp
diff --git a/test/collective/test_mpi_comm.sh b/test/collective/test_mpi_comm.sh
index 062d3c1ed8e5e..83ef86fd4713e 100644
--- a/test/collective/test_mpi_comm.sh
+++ b/test/collective/test_mpi_comm.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/collective/test_orthogonal_strategy.sh b/test/collective/test_orthogonal_strategy.sh
index 6b4df2b124617..f65ac14842bb6 100644
--- a/test/collective/test_orthogonal_strategy.sh
+++ b/test/collective/test_orthogonal_strategy.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/collective/test_strategy_group.sh b/test/collective/test_strategy_group.sh
index d6c3a0e79fa87..7dc334278d00a 100644
--- a/test/collective/test_strategy_group.sh
+++ b/test/collective/test_strategy_group.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/collective/test_world_size_and_rank.sh b/test/collective/test_world_size_and_rank.sh
index c559c4bd26cff..e6762009083bc 100644
--- a/test/collective/test_world_size_and_rank.sh
+++ b/test/collective/test_world_size_and_rank.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/contrib/CMakeLists.txt b/test/contrib/CMakeLists.txt
index e723b8abaf396..02219f03129b8 100644
--- a/test/contrib/CMakeLists.txt
+++ b/test/contrib/CMakeLists.txt
@@ -20,3 +20,4 @@ py_test_modules(
   FLAGS_conv_workspace_size_limit=1000)
 
 set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120)
+set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
diff --git a/test/contrib/test_bf16_utils.py b/test/contrib/test_bf16_utils.py
new file mode 100644
index 0000000000000..ce542e9603dad
--- /dev/null
+++ b/test/contrib/test_bf16_utils.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import unittest
+
+import paddle
+from paddle.static import amp
+
+paddle.enable_static()
+
+
+class AMPTest(unittest.TestCase):
+    def setUp(self):
+        self.bf16_list = copy.copy(amp.bf16.amp_lists.bf16_list)
+        self.fp32_list = copy.copy(amp.bf16.amp_lists.fp32_list)
+        self.gray_list = copy.copy(amp.bf16.amp_lists.gray_list)
+        self.amp_lists_ = None
+
+    def tearDown(self):
+        self.assertEqual(self.amp_lists_.bf16_list, self.bf16_list)
+        self.assertEqual(self.amp_lists_.fp32_list, self.fp32_list)
+        self.assertEqual(self.amp_lists_.gray_list, self.gray_list)
+
+    def test_amp_lists(self):
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16()
+
+    def test_amp_lists_1(self):
+        # 1. w={'exp'}, b=None
+        self.bf16_list.add('exp')
+        self.fp32_list.remove('exp')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'exp'})
+
+    def test_amp_lists_2(self):
+        # 2. w={'tan'}, b=None
+        self.fp32_list.remove('tan')
+        self.bf16_list.add('tan')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'tan'})
+
+    def test_amp_lists_3(self):
+        # 3. w={'lstm'}, b=None
+        self.bf16_list.add('lstm')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'lstm'})
+
+    def test_amp_lists_4(self):
+        # 4. w=None, b={'matmul_v2'}
+        self.bf16_list.remove('matmul_v2')
+        self.fp32_list.add('matmul_v2')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'matmul_v2'}
+        )
+
+    def test_amp_lists_5(self):
+        # 5. w=None, b={'matmul_v2'}
+        self.fp32_list.add('matmul_v2')
+        self.bf16_list.remove('matmul_v2')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'matmul_v2'}
+        )
+
+    def test_amp_lists_6(self):
+        # 6. w=None, b={'lstm'}
+        self.fp32_list.add('lstm')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'lstm'}
+        )
+
+    def test_amp_lists_7(self):
+        self.fp32_list.add('reshape2')
+        self.gray_list.remove('reshape2')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
+            custom_fp32_list={'reshape2'}
+        )
+
+    def test_amp_list_8(self):
+        self.bf16_list.add('reshape2')
+        self.gray_list.remove('reshape2')
+
+        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
+            custom_bf16_list={'reshape2'}
+        )
+
+
+class AMPTest2(unittest.TestCase):
+    def test_amp_lists_(self):
+        # 7. w={'lstm'}, b={'lstm'}
+        # raise ValueError
+        self.assertRaises(
+            ValueError, amp.bf16.AutoMixedPrecisionListsBF16, {'lstm'}, {'lstm'}
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/contrib/test_image_classification_fp16.py b/test/contrib/test_image_classification_fp16.py
similarity index 67%
rename from test/deprecated/contrib/test_image_classification_fp16.py
rename to test/contrib/test_image_classification_fp16.py
index c3cfa834a4ed2..d927b68eff9b3 100644
--- a/test/deprecated/contrib/test_image_classification_fp16.py
+++ b/test/contrib/test_image_classification_fp16.py
@@ -23,30 +23,130 @@
 import numpy
 
 # TODO: remove sys.path.append
-sys.path.append("../../legacy_test")
+sys.path.append("../legacy_test")
 import nets
 
 import paddle
 from paddle import base
+from paddle.framework import in_pir_mode
+from paddle.nn import Layer
+from paddle.pir_utils import test_with_pir_api
 from paddle.static.amp import decorate
 
 paddle.enable_static()
 
 
+def img_conv_group_pir(
+    input,
+    in_channels,
+    out_channels,
+    conv_num_filter,
+    kernel_size,
+    pool_size,
+    pool_stride=1,
+    pool_padding=0,
+    pool_type='max',
+    global_pooling=False,
+    conv_with_batchnorm=False,
+    conv_batchnorm_drop_rate=0.0,
+    conv_stride=1,
+    conv_padding=1,
+    conv_filter_size=3,
+    conv_dilation=1,
+    conv_groups=1,
+    param_attr=None,
+    bias_attr=None,
+    conv_act=None,
+    use_cudnn=True,
+):
+    tmp = input
+    assert isinstance(conv_num_filter, (list, tuple))
+
+    def __extend_list__(obj):
+        if not hasattr(obj, '__len__'):
+            return [obj] * len(conv_num_filter)
+        else:
+            assert len(obj) == len(conv_num_filter)
+            return obj
+
+    conv_padding = __extend_list__(conv_padding)
+    conv_filter_size = __extend_list__(conv_filter_size)
+    param_attr = __extend_list__(param_attr)
+    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
+    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
+
+    for i in range(len(conv_num_filter)):
+        local_conv_act = conv_act
+        if conv_with_batchnorm[i]:
+            local_conv_act = None
+
+        conv = paddle.nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=conv_stride,
+            padding=conv_padding[i],
+            dilation=conv_dilation,
+            groups=conv_groups,
+            bias_attr=bias_attr,
+        )
+        conv_out = conv(input)
+
+        if conv_with_batchnorm[i]:
+            batch_norm = paddle.nn.BatchNorm(in_channels, act=conv_act)
+            tmp = batch_norm(tmp)
+            drop_rate = conv_batchnorm_drop_rate[i]
+            if abs(drop_rate) > 1e-5:
+                tmp = paddle.nn.functional.dropout(x=tmp, p=drop_rate)
+
+    if pool_type == 'max':
+        pool_out = paddle.nn.functional.max_pool2d(
+            x=tmp,
+            kernel_size=pool_size,
+            stride=pool_stride,
+        )
+    else:
+        pool_out = paddle.nn.functional.avg_pool2d(
+            x=tmp,
+            kernel_size=pool_size,
+            stride=pool_stride,
+        )
+    return pool_out
+
+
 def resnet_cifar10(input, depth=32):
     def conv_bn_layer(
-        input, ch_out, filter_size, stride, padding, act='relu', bias_attr=False
+        input,
+        ch_out,
+        filter_size,
+        stride,
+        padding,
+        act='relu',
+        bias_attr=False,
     ):
-        tmp = paddle.static.nn.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr,
-        )
-        return paddle.static.nn.batch_norm(input=tmp, act=act)
+        if in_pir_mode():
+            conv = paddle.nn.Conv2D(
+                in_channels=input.shape[1],
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=padding,
+                bias_attr=bias_attr,
+            )
+            tmp = conv(input)
+            bn = paddle.nn.BatchNorm(tmp.shape[1], act=act)
+            return bn(tmp)
+        else:
+            tmp = paddle.static.nn.conv2d(
+                input=input,
+                filter_size=filter_size,
+                num_filters=ch_out,
+                stride=stride,
+                padding=padding,
+                act=None,
+                bias_attr=bias_attr,
+            )
+            return paddle.static.nn.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride):
         if ch_in != ch_out:
@@ -80,17 +180,32 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride):
 
 def vgg16_bn_drop(input):
     def conv_block(input, num_filter, groups, dropouts):
-        return nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max',
-        )
+        if in_pir_mode():
+            return img_conv_group_pir(
+                input,
+                in_channels=3,
+                out_channels=num_filter,
+                conv_num_filter=[num_filter] * groups,
+                kernel_size=3,
+                pool_size=2,
+                pool_stride=2,
+                pool_padding=0,
+                pool_type='max',
+                conv_act='relu',
+                conv_with_batchnorm=True,
+            )
+        else:
+            return nets.img_conv_group(
+                input=input,
+                pool_size=2,
+                pool_stride=2,
+                conv_num_filter=[num_filter] * groups,
+                conv_filter_size=3,
+                conv_act='relu',
+                conv_with_batchnorm=True,
+                conv_batchnorm_drop_rate=dropouts,
+                pool_type='max',
+            )
 
     conv1 = conv_block(input, 64, 2, [0.3, 0])
     conv2 = conv_block(conv1, 128, 2, [0.4, 0])
@@ -100,7 +215,11 @@ def conv_block(input, num_filter, groups, dropouts):
 
     drop = paddle.nn.functional.dropout(x=conv5, p=0.5)
     fc1 = paddle.static.nn.fc(x=drop, size=4096, activation=None)
-    bn = paddle.static.nn.batch_norm(input=fc1, act='relu')
+    if in_pir_mode():
+        batch_norm = paddle.nn.BatchNorm(4096)
+        bn = batch_norm(fc1)
+    else:
+        bn = paddle.static.nn.batch_norm(input=fc1, act='relu')
     drop2 = paddle.nn.functional.dropout(x=bn, p=0.5)
     fc2 = paddle.static.nn.fc(x=drop2, size=4096, activation=None)
     return fc2
@@ -110,8 +229,8 @@ def train(net_type, use_cuda, save_dirname, is_local):
     classdim = 10
     data_shape = [3, 32, 32]
 
-    train_program = base.Program()
-    startup_prog = base.Program()
+    train_program = paddle.static.Program()
+    startup_prog = paddle.static.Program()
     paddle.seed(123)
     with base.program_guard(train_program, startup_prog):
         images = paddle.static.data(
@@ -128,31 +247,85 @@ def train(net_type, use_cuda, save_dirname, is_local):
         else:
             raise ValueError("%s network is not supported" % net_type)
 
-        logits = paddle.static.nn.fc(x=net, size=classdim, activation="softmax")
-        cost, predict = paddle.nn.functional.softmax_with_cross_entropy(
-            logits, label, return_softmax=True
-        )
-        avg_cost = paddle.mean(cost)
-        acc = paddle.static.accuracy(input=predict, label=label)
+        optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
 
-        # Test program
-        test_program = train_program.clone(for_test=True)
+        if in_pir_mode():
 
-        optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
+            class layer(Layer):
+                def __init__(self, classdim, act):
+                    super().__init__()
+                    self.classdim = classdim
+                    self.act = act
 
-        amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
-            custom_black_varnames={"loss", "conv2d_0.w_0"}
-        )
-        mp_optimizer = decorate(
-            optimizer=optimizer,
-            amp_lists=amp_lists,
-            init_loss_scaling=8.0,
-            use_dynamic_loss_scaling=True,
-        )
+                def forward(self, x):
+                    logits = paddle.static.nn.fc(
+                        x=x, size=self.classdim, activation=self.act
+                    )
+                    (
+                        cost,
+                        predict,
+                    ) = paddle.nn.functional.softmax_with_cross_entropy(
+                        logits, label, return_softmax=True
+                    )
+                    return cost, predict
+
+            model = layer(classdim, "softmax")
+            model, optimizer = paddle.amp.decorate(
+                models=model,
+                optimizers=optimizer,
+                level="O2",
+                dtype='float16',
+            )
+            scaler = paddle.amp.GradScaler(
+                init_loss_scaling=8.0, use_dynamic_loss_scaling=True
+            )
+
+            with paddle.amp.auto_cast(
+                enable=True,
+                level='O2',
+                dtype='float16',
+                custom_black_list={'transpose2', 'concat'},
+                use_promote=True,
+            ):
+                cost, predict = model(net)
+                avg_cost = paddle.mean(cost)
+                acc = paddle.static.accuracy(input=predict, label=label)
+            # Test program
+            value_map = paddle.pir.IrMapping()
+            test_program = train_program.clone(value_map)
+            fetch_list = []
+            fetch_list.append(value_map.look_up(avg_cost))
+            fetch_list.append(value_map.look_up(acc))
+
+            scaled = scaler.scale(avg_cost)
+            scaler.minimize(optimizer, scaled, startup_program=startup_prog)
+            loss_scaling = optimizer.get_loss_scaling()
+            scaled_loss = optimizer.get_scaled_loss()
+        else:
+            logits = paddle.static.nn.fc(
+                x=net, size=classdim, activation="softmax"
+            )
+            cost, predict = paddle.nn.functional.softmax_with_cross_entropy(
+                logits, label, return_softmax=True
+            )
+            avg_cost = paddle.mean(cost)
+            acc = paddle.static.accuracy(input=predict, label=label)
+            # Test program
+            test_program = train_program.clone(for_test=True)
+            fetch_list = [avg_cost, acc]
+            amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
+                custom_black_varnames={"loss", "conv2d_0.w_0"}
+            )
+            mp_optimizer = decorate(
+                optimizer=optimizer,
+                amp_lists=amp_lists,
+                init_loss_scaling=8.0,
+                use_dynamic_loss_scaling=True,
+            )
 
-        mp_optimizer.minimize(avg_cost)
-        loss_scaling = mp_optimizer.get_loss_scaling()
-        scaled_loss = mp_optimizer.get_scaled_loss()
+            mp_optimizer.minimize(avg_cost)
+            loss_scaling = mp_optimizer.get_loss_scaling()
+            scaled_loss = mp_optimizer.get_scaled_loss()
 
     BATCH_SIZE = 128
     PASS_NUM = 1
@@ -190,7 +363,7 @@ def train_loop(main_program):
                         loss_t, acc_t = exe.run(
                             program=test_program,
                             feed=feeder.feed(test_data),
-                            fetch_list=[avg_cost, acc],
+                            fetch_list=fetch_list,
                         )
                         if math.isnan(float(loss_t)):
                             sys.exit("got NaN loss, training failed.")
@@ -456,10 +629,12 @@ def test_amp_lists_7(self):
             {'lstm'},
         )
 
+    @test_with_pir_api
     def test_vgg_cuda(self):
         with self.scope_prog_guard():
             self.main('vgg', use_cuda=True)
 
+    @test_with_pir_api
     def test_resnet_cuda(self):
         with self.scope_prog_guard():
             self.main('resnet', use_cuda=True)
@@ -474,44 +649,5 @@ def scope_prog_guard(self):
                 yield
 
 
-class TestAmpWithNonIterableDataLoader(unittest.TestCase):
-    def decorate_with_data_loader(self):
-        main_prog = paddle.static.Program()
-        start_prog = paddle.static.Program()
-        with paddle.static.program_guard(main_prog, start_prog):
-            with paddle.base.unique_name.guard():
-                image = paddle.static.data(
-                    name='image', shape=[-1, 3, 224, 224], dtype='float32'
-                )
-                label = paddle.static.data(
-                    name='label', shape=[-1, 1], dtype='int64'
-                )
-
-                net = vgg16_bn_drop(image)
-                logits = paddle.static.nn.fc(
-                    x=net, size=10, activation="softmax"
-                )
-                cost, predict = paddle.nn.functional.softmax_with_cross_entropy(
-                    logits, label, return_softmax=True
-                )
-                avg_cost = paddle.mean(cost)
-
-                optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
-                amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
-                    custom_black_varnames={"loss", "conv2d_0.w_0"}
-                )
-                mp_optimizer = decorate(
-                    optimizer=optimizer,
-                    amp_lists=amp_lists,
-                    init_loss_scaling=8.0,
-                    use_dynamic_loss_scaling=True,
-                )
-
-                mp_optimizer.minimize(avg_cost)
-
-    def test_non_iterable_dataloader(self):
-        self.decorate_with_data_loader()
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc
index 0ae0b8ed3eaf1..8395588b5ce58 100644
--- a/test/cpp/auto_parallel/spmd_rule_test.cc
+++ b/test/cpp/auto_parallel/spmd_rule_test.cc
@@ -1853,6 +1853,71 @@ TEST(CumSumGradInferSpmd, Ctor) {
             std::vector<int64_t>({-1, -1, -1}));
 }
 
+TEST(Flatten, Ctor) {
+  std::vector<int64_t> mesh_shape = {2, 2};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3};
+  std::vector<std::string> dim_names = {"x", "y"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  auto build_input = [&](const std::vector<int64_t>& shape,
+                         const std::vector<int64_t>& dim_mapping) {
+    auto t_dist_attr = TensorDistAttr();
+    t_dist_attr.set_process_mesh(process_mesh);
+    t_dist_attr.set_dims_mapping(dim_mapping);
+    t_dist_attr.set_dynamic_dims(std::vector<bool>(shape.size(), false));
+    auto input =
+        phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr);
+    return input;
+  };
+
+  // [b, h/ph, w/pw, c, ph, pw]; dp
+  auto input1 = build_input({4, 16, 16, 4, 2, 2}, {0, -1, -1, -1, -1, -1});
+  // [b, h/ph, w/pw, c, ph, pw] => [b, h/ph, w/pw, hidden_size]
+  auto spmd1 = FlattenInferSpmd(input1, -3, -1);
+  EXPECT_EQ(spmd1.first.size(), static_cast<size_t>(1));
+  EXPECT_EQ(spmd1.second.size(), static_cast<size_t>(2));
+  check_dim_mapping(spmd1.first[0], {0, -1, -1, -1, -1, -1});
+  check_dim_mapping(spmd1.second[0], {0, -1, -1, -1});
+  check_dim_mapping(spmd1.second[1], {-1, 0, -1, -1, -1, -1, -1});  // x_shape
+
+  // [b, h/ph, w/pw, c, ph, pw]; dp, mp
+  auto input2 = build_input({4, 16, 16, 4, 2, 2}, {-1, 0, -1, 1, -1, -1});
+  auto spmd2 = FlattenInferSpmd(input2, 1, 4);
+  EXPECT_EQ(spmd2.first.size(), static_cast<size_t>(1));
+  EXPECT_EQ(spmd2.second.size(), static_cast<size_t>(2));
+  check_dim_mapping(spmd2.first[0], {-1, 0, -1, -1, -1, -1});
+  check_dim_mapping(spmd2.second[0], {-1, 0, -1});
+  check_dim_mapping(spmd2.second[1], {-1, -1, 0, -1, -1, -1, -1});  // x_shape
+
+  // [b, s, nh, h/nh]; dp , mp
+  auto input3 = build_input({2, 1024, 32, 32}, {0, -1, 1, -1});
+  // [b, s, nh, h/nh] => [b, s, h]
+  auto spmd3 = FlattenInferSpmd(input3, 2, 3);
+  EXPECT_EQ(spmd3.first.size(), static_cast<size_t>(1));
+  EXPECT_EQ(spmd3.second.size(), static_cast<size_t>(2));
+  check_dim_mapping(spmd3.first[0], {0, -1, 1, -1});
+  check_dim_mapping(spmd3.second[0], {0, -1, 1});
+  check_dim_mapping(spmd3.second[1], {-1, 0, -1, 1, -1});  // x_shape
+
+  // [b, c, d, h, w]; dp, mp
+  auto input4 = build_input({4, 16, 16, 4, 16}, {-1, -1, 0, 1, -1});
+  auto spmd4 = FlattenInferSpmd(input4, 1, 4);
+  EXPECT_EQ(spmd4.first.size(), static_cast<size_t>(1));
+  EXPECT_EQ(spmd4.second.size(), static_cast<size_t>(2));
+  check_dim_mapping(spmd4.first[0], {-1, -1, -1, -1, -1});
+  check_dim_mapping(spmd4.second[0], {-1, -1});
+  check_dim_mapping(spmd4.second[1], {-1, -1, -1, -1, -1, -1});  // x_shape
+
+  auto out_grad = build_input({2, 1024, 1024}, {0, -1, 1});
+  auto xshape = build_input({0, 2, 1024, 4, 1024 / 4}, {-1, 0, -1, 1, -1});
+  auto spmd_grad = FlattenGradInferSpmd(xshape, out_grad);
+  EXPECT_EQ(spmd_grad.first.size(), static_cast<size_t>(2));
+  EXPECT_EQ(spmd_grad.second.size(), static_cast<size_t>(1));
+  check_dim_mapping(spmd_grad.first[0], {-1, 0, -1, 1, -1});
+  check_dim_mapping(spmd_grad.first[1], {0, -1, 1});
+  check_dim_mapping(spmd_grad.second[0], {0, -1, 1, -1});
+}
+
 }  // namespace auto_parallel
 }  // namespace distributed
 }  // namespace paddle
diff --git a/test/cpp/eager/task_tests/CMakeLists.txt b/test/cpp/eager/task_tests/CMakeLists.txt
index 39a11d9582ae3..393421be711f0 100755
--- a/test/cpp/eager/task_tests/CMakeLists.txt
+++ b/test/cpp/eager/task_tests/CMakeLists.txt
@@ -4,7 +4,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   paddle_test(test_egr_task_hook SRCS hook_test.cc)
   paddle_test(test_egr_task_backward SRCS backward_test.cc)
   paddle_test(test_egr_task_grad SRCS grad_test.cc)
-  paddle_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc)
+  paddle_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS phi)
   paddle_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc)
   paddle_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc)
   paddle_test(test_egr_task_autocodegen SRCS generated_test.cc)
diff --git a/test/cpp/fluid/benchmark/CMakeLists.txt b/test/cpp/fluid/benchmark/CMakeLists.txt
index 0b14b812af9f9..cb8e47a0b305f 100644
--- a/test/cpp/fluid/benchmark/CMakeLists.txt
+++ b/test/cpp/fluid/benchmark/CMakeLists.txt
@@ -1,4 +1,4 @@
-paddle_test(op_tester SRCS op_tester.cc DEPS common)
+paddle_test(op_tester SRCS op_tester.cc DEPS common phi)
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will
diff --git a/test/cpp/fluid/memory/malloc_test.cu b/test/cpp/fluid/memory/malloc_test.cu
index 5728363ac8877..a86d3c4988b7b 100644
--- a/test/cpp/fluid/memory/malloc_test.cu
+++ b/test/cpp/fluid/memory/malloc_test.cu
@@ -37,8 +37,6 @@ const int NUM_STREAMS = 8;
 const int N = 2;
 const float DELTA = 1e-1;
 
-using CudaDevCtxVec = std::vector<std::unique_ptr<phi::GPUContext>>;
-
 __global__ void kernel(float *x, int n) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
@@ -46,51 +44,58 @@ __global__ void kernel(float *x, int n) {
   }
 }
 
-void CheckKernelOutput(float *x, int n) {
+void CheckKernelOutput(const AllocationPtr &x, int n) {
   auto host_x = std::unique_ptr<float[]>(new float[n]);
   for (int i = 0; i < n; ++i) {
 #ifdef PADDLE_WITH_HIP
-    EXPECT_TRUE(
-        hipSuccess ==
-        hipMemcpy(host_x.get(), x, n * sizeof(float), hipMemcpyDeviceToHost));
+    EXPECT_TRUE(hipSuccess == hipMemcpy(host_x.get(),
+                                        (x->ptr()),
+                                        n * sizeof(float),
+                                        hipMemcpyDeviceToHost));
 #else
-    EXPECT_TRUE(
-        cudaSuccess ==
-        cudaMemcpy(host_x.get(), x, n * sizeof(float), cudaMemcpyDeviceToHost));
+    EXPECT_TRUE(cudaSuccess == cudaMemcpy(host_x.get(),
+                                          (x->ptr()),
+                                          n * sizeof(float),
+                                          cudaMemcpyDeviceToHost));
 #endif
     EXPECT_GE(host_x[i] + DELTA, 3.14159f * i);
     EXPECT_LE(host_x[i] - DELTA, 3.14159f * i);
   }
 }
 
-void MultiStreamCompute(float **data,
-                        float **second_data,
-                        const phi::GPUContext &ctx) {
+void MultiStreamCompute(const AllocationPtr &first_data,
+                        const AllocationPtr &second_data,
+                        phi::GPUContext *ctx) {
   // multi-streams
-  AllocationPtr allocation_ptr =
-      Alloc(ctx.GetPlace(),
-            N * sizeof(float),
-            phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
-  *data = reinterpret_cast<float *>(allocation_ptr->ptr());
+  EXPECT_GE(first_data->size(), N * sizeof(float));
+
 #ifdef PADDLE_WITH_HIP
-  hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, ctx.stream(), *data, N);
+  hipLaunchKernelGGL((kernel),
+                     dim3(1),
+                     dim3(64),
+                     0,
+                     ctx->stream(),
+                     reinterpret_cast<float *>(first_data->ptr()),
+                     N);
 #else
-  kernel<<<1, 64, 0, ctx.stream()>>>(*data, N);
+  kernel<<<1, 64, 0, ctx->stream()>>>(
+      reinterpret_cast<float *>(first_data->ptr()), N);
 #endif
 
+  EXPECT_GE(second_data->size(), N * sizeof(float));
   // allocate and compute on same stream again
-  allocation_ptr =
-      Alloc(ctx.GetPlace(),
-            N * sizeof(float),
-            phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
-  *second_data = reinterpret_cast<float *>(allocation_ptr->ptr());
+
 #ifdef PADDLE_WITH_HIP
-  hipLaunchKernelGGL(
-      (kernel), dim3(1), dim3(64), 0, ctx.stream(), *second_data, N);
+  hipLaunchKernelGGL((kernel),
+                     dim3(1),
+                     dim3(64),
+                     0,
+                     ctx->stream(),
+                     reinterpret_cast<float *>(second_data->ptr()),
+                     N);
 #else
-  kernel<<<1, 64, 0, ctx.stream()>>>(*second_data, N);
+  kernel<<<1, 64, 0, ctx->stream()>>>(
+      reinterpret_cast<float *>(second_data->ptr()), N);
 #endif
 }
 
@@ -100,23 +105,26 @@ TEST(Malloc, GPUContextMultiStream) {
 
   AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
   EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
-  float *main_stream_data =
-      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());
 
-  float *data[NUM_STREAMS];
-  float *second_data[NUM_STREAMS];
-  CudaDevCtxVec dev_ctx;
+  AllocationPtr first_data[NUM_STREAMS], second_data[NUM_STREAMS];
+  std::vector<phi::GPUContext *> dev_ctx;
 
 // default stream
 #ifdef PADDLE_WITH_HIP
-  hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, 0, main_stream_data, N);
+  hipLaunchKernelGGL((kernel),
+                     dim3(1),
+                     dim3(64),
+                     0,
+                     0,
+                     reinterpret_cast<float *>(main_stream_alloc_ptr->ptr()),
+                     N);
 #else
-  kernel<<<1, 64>>>(main_stream_data, N);
+  kernel<<<1, 64>>>(reinterpret_cast<float *>(main_stream_alloc_ptr->ptr()), N);
 #endif
   main_stream_alloc_ptr.reset();
 
   for (int i = 0; i < NUM_STREAMS; ++i) {
-    auto ctx = std::make_unique<phi::GPUContext>(place);
+    auto ctx = new phi::GPUContext(place);
     ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(place, ctx->stream())
                           .get());
@@ -133,8 +141,16 @@ TEST(Malloc, GPUContextMultiStream) {
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
             .get());
     ctx->PartialInitWithAllocator();
-    dev_ctx.emplace_back(std::move(ctx));
-    MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
+    dev_ctx.emplace_back(ctx);
+    first_data[i] =
+        Alloc(ctx->GetPlace(),
+              N * sizeof(float),
+              phi::Stream(reinterpret_cast<phi::StreamId>(ctx->stream())));
+    second_data[i] =
+        Alloc(ctx->GetPlace(),
+              N * sizeof(float),
+              phi::Stream(reinterpret_cast<phi::StreamId>(ctx->stream())));
+    MultiStreamCompute(first_data[i], second_data[i], ctx);
   }
 
 #ifdef PADDLE_WITH_HIP
@@ -142,10 +158,21 @@ TEST(Malloc, GPUContextMultiStream) {
 #else
   EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
 #endif
+
   for (int i = 0; i < NUM_STREAMS; ++i) {
-    CheckKernelOutput(data[i], N);
+    CheckKernelOutput(first_data[i], N);
     CheckKernelOutput(second_data[i], N);
   }
+
+  // For cudaMallocAsyncAllocator, cudaFreeAsync is executed on _malloc_stream,
+  // which is the stream passed at Alloc(). Therefore, destroying the stream
+  // must be postponed until the memory is freed. Otherwise, the stream would
+  // be destroyed before cudaFreeAsync is called.
+  for (int i = 0; i < NUM_STREAMS; i++) {
+    first_data[i].reset();
+    second_data[i].reset();
+    delete dev_ctx[i];
+  }
 }
 
 TEST(Malloc, GPUContextMultiThreadMultiStream) {
@@ -154,24 +181,27 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) {
 
   AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
   EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
-  float *main_stream_data =
-      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());
 
-  float *data[NUM_STREAMS];
-  float *second_data[NUM_STREAMS];
-  CudaDevCtxVec dev_ctx;
-  std::vector<std::thread> threads;
+  AllocationPtr first_data[NUM_STREAMS], second_data[NUM_STREAMS];
+  std::vector<phi::GPUContext *> dev_ctx;
 
 // default stream
 #ifdef PADDLE_WITH_HIP
-  hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, 0, main_stream_data, N);
+  hipLaunchKernelGGL((kernel),
+                     dim3(1),
+                     dim3(64),
+                     0,
+                     0,
+                     reinterpret_cast<float *>(main_stream_alloc_ptr->ptr()),
+                     N);
 #else
-  kernel<<<1, 64>>>(main_stream_data, N);
+  kernel<<<1, 64>>>(reinterpret_cast<float *>(main_stream_alloc_ptr->ptr()), N);
 #endif
   main_stream_alloc_ptr.reset();
+  std::vector<std::thread> threads;
 
   for (int i = 0; i < NUM_STREAMS; ++i) {
-    auto ctx = std::make_unique<phi::GPUContext>(place);
+    auto ctx = new phi::GPUContext(place);
     ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(place, ctx->stream())
                           .get());
@@ -192,23 +222,43 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) {
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
             .get());
     ctx->PartialInitWithAllocator();
-    dev_ctx.emplace_back(std::move(ctx));
-    threads.emplace_back(
-        MultiStreamCompute, &data[i], &second_data[i], std::cref(*dev_ctx[i]));
+    dev_ctx.emplace_back(ctx);
+    first_data[i] =
+        Alloc(ctx->GetPlace(),
+              N * sizeof(float),
+              phi::Stream(reinterpret_cast<phi::StreamId>(ctx->stream())));
+    second_data[i] =
+        Alloc(ctx->GetPlace(),
+              N * sizeof(float),
+              phi::Stream(reinterpret_cast<phi::StreamId>(ctx->stream())));
+    threads.emplace_back(MultiStreamCompute,
+                         std::ref(first_data[i]),
+                         std::ref(second_data[i]),
+                         ctx);
   }
 
   for (int i = 0; i < NUM_STREAMS; ++i) {
     threads[i].join();
   }
+
 #ifdef PADDLE_WITH_HIP
   EXPECT_TRUE(hipSuccess == hipDeviceSynchronize());
 #else
   EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
 #endif
+
   for (int i = 0; i < NUM_STREAMS; ++i) {
-    CheckKernelOutput(data[i], N);
+    CheckKernelOutput(first_data[i], N);
     CheckKernelOutput(second_data[i], N);
   }
+
+  // The allocations were made on each context's stream, so free them
+  // explicitly before the owning GPUContext (and its stream) is deleted.
+  for (int i = 0; i < NUM_STREAMS; i++) {
+    first_data[i].reset();
+    second_data[i].reset();
+    delete dev_ctx[i];
+  }
 }
 
 TEST(Malloc, AllocZero) {
diff --git a/test/cpp/inference/infer_ut/run.sh b/test/cpp/inference/infer_ut/run.sh
index 88cdb3bacc1e5..91fd69d5e76f6 100755
--- a/test/cpp/inference/infer_ut/run.sh
+++ b/test/cpp/inference/infer_ut/run.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/cpp/pir/cinn/compilation_task_test.cc b/test/cpp/pir/cinn/compilation_task_test.cc
index 254ab7c4baf8a..3fbe4ed4ba60b 100644
--- a/test/cpp/pir/cinn/compilation_task_test.cc
+++ b/test/cpp/pir/cinn/compilation_task_test.cc
@@ -24,6 +24,7 @@
 #include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h"
 #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
 #include "paddle/cinn/hlir/framework/pir/compilation_task.h"
+#include "paddle/cinn/hlir/framework/pir/utils.h"
 #include "paddle/cinn/hlir/framework/pir_compiler.h"
 #include "paddle/cinn/utils/data_util.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
@@ -34,6 +35,7 @@
 
 PD_DECLARE_bool(cinn_bucket_compile);
 
+using cinn::hlir::framework::pir::CompatibleInfo;
 using cinn::hlir::framework::pir::OpLoweringGroup;
 using cinn::hlir::framework::pir::OpLoweringGroupPtr;
 
@@ -50,8 +52,11 @@ ProgramInfo BuildProgram(std::vector<int64_t> input_shape) {
       input_shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace());
 
   std::vector<OpLoweringGroupPtr> groups;
+  const std::string fn_name = CompatibleInfo::GroupOpsName(
+      std::initializer_list<::pir::Operation*>({full_op_x.operation()}));
   groups.emplace_back(std::make_shared<OpLoweringGroup>(
-      std::initializer_list<::pir::Operation*>({full_op_x.operation()})));
+      std::initializer_list<::pir::Operation*>({full_op_x.operation()}),
+      fn_name));
   groups.back()->mut_output_ops().insert(full_op_x.operation());
 
   return {program, groups};
diff --git a/test/cpp/pir/cinn/file_tile_config_test.cc b/test/cpp/pir/cinn/file_tile_config_test.cc
index 3cdcc7a390bbe..d863baca924f7 100644
--- a/test/cpp/pir/cinn/file_tile_config_test.cc
+++ b/test/cpp/pir/cinn/file_tile_config_test.cc
@@ -39,7 +39,7 @@ TEST(ConfigSearcher, TestReduceDemo) {
   constexpr int kMaxThreadsPerBlock = 1024;
 
   // Step 1: Construct iter space and tile config.
-  cinn::ir::search::IterSpace iter_space;
+  cinn::ir::BucketInfo bucket_info;
   int s_dimension_lower = 32;
   int s_dimension_upper = 128;
   auto s_dimension_type = "S";
@@ -49,61 +49,52 @@ TEST(ConfigSearcher, TestReduceDemo) {
   auto r_dimension_type = "R";
   auto r_dimension_is_dynamic = true;
 
-  iter_space.space.push_back(cinn::ir::search::IterSpace::Dimension{
-      s_dimension_lower,
-      s_dimension_upper,
-      s_dimension_type,
-      s_dimension_is_dynamic,
-      std::vector<double>(128 - 32, 1.0)});
-  iter_space.space.push_back(
-      cinn::ir::search::IterSpace::Dimension{r_dimension_lower,
-                                             r_dimension_upper,
-                                             r_dimension_type,
-                                             r_dimension_is_dynamic,
-                                             std::vector<double>(1, 1.0)});
-  cinn::ir::BucketInfo bucket_info;
-  bucket_info.sp_lower_bound = iter_space.space[0].lower_bound;
-  bucket_info.sp_upper_bound = iter_space.space[0].upper_bound;
-  bucket_info.rb_lower_bound = iter_space.space[1].lower_bound;
-  bucket_info.rb_upper_bound = iter_space.space[1].upper_bound;
+  bucket_info.space.push_back(
+      cinn::ir::BucketInfo::Dimension{s_dimension_lower,
+                                      s_dimension_upper,
+                                      s_dimension_type,
+                                      s_dimension_is_dynamic,
+                                      std::vector<double>(128 - 32, 1.0)});
+  bucket_info.space.push_back(
+      cinn::ir::BucketInfo::Dimension{r_dimension_lower,
+                                      r_dimension_upper,
+                                      r_dimension_type,
+                                      r_dimension_is_dynamic,
+                                      std::vector<double>(1, 1.0)});
+
   cinn::ir::ScheduleConfig::TileConfig tile_config;
   tile_config.spatial_inner_num = 32;
   tile_config.warp_num = 32;
   tile_config.tree_reduce_num = 128;
   std::vector<std::pair<std::string, std::string>> iter_space_type = {
-      std::make_pair("R", "dynamic"), std::make_pair("S", "dynamic")};
+      std::make_pair("S", "dynamic"), std::make_pair("R", "dynamic")};
   // Step 2: Add to json/Read from json
   cinn::ir::FileTileConfigDatabase file_database;
-  file_database.AddConfig(cinn::common::DefaultTarget(),
-                          iter_space_type,
-                          bucket_info,
-                          tile_config,
-                          2);
+  file_database.AddConfig(
+      cinn::common::DefaultTarget(), bucket_info, tile_config, 2);
   cinn::ir::TileConfigMap tile_config_map =
       file_database.GetConfigs(cinn::common::DefaultTarget(), iter_space_type);
   for (auto& it : tile_config_map) {
-    LOG(INFO) << "sp_lower_bound is " << it.first.sp_lower_bound;
-    LOG(INFO) << "sp_upper_bound is " << it.first.sp_upper_bound;
-    LOG(INFO) << "rb_lower_bound is " << it.first.rb_lower_bound;
-    LOG(INFO) << "rb_upper_bound is " << it.first.rb_upper_bound;
+    LOG(INFO) << "bucket info is: ";
+    auto dims = it.first.space.size();
+    for (int i = 0; i < dims; i++) {
+      LOG(INFO) << "Dimension " << i
+                << " 's lower_bound is: " << it.first.space[i].lower_bound;
+      LOG(INFO) << "Dimension " << i
+                << " 's upper_bound is: " << it.first.space[i].upper_bound;
+      auto dimension_lower = i == 0 ? s_dimension_lower : r_dimension_lower;
+      auto dimension_upper = i == 0 ? s_dimension_upper : r_dimension_upper;
+      PADDLE_ENFORCE_EQ(it.first.space[i].lower_bound,
+                        dimension_lower,
+                        ::common::errors::InvalidArgument(
+                            "GetConfigs function gets wrong dimension_lower"));
+      PADDLE_ENFORCE_EQ(it.first.space[i].upper_bound,
+                        dimension_upper,
+                        ::common::errors::InvalidArgument(
+                            "GetConfigs function gets wrong dimension_upper"));
+    }
     LOG(INFO) << "tile config is " << it.second.spatial_inner_num << " "
               << it.second.warp_num << " " << it.second.tree_reduce_num;
-    PADDLE_ENFORCE_EQ(it.first.sp_lower_bound,
-                      s_dimension_lower,
-                      ::common::errors::InvalidArgument(
-                          "GetConfigs function gets wrong s_dimension_lower"));
-    PADDLE_ENFORCE_EQ(it.first.sp_upper_bound,
-                      s_dimension_upper,
-                      ::common::errors::InvalidArgument(
-                          "GetConfigs function gets wrong s_dimension_upper"));
-    PADDLE_ENFORCE_EQ(it.first.rb_lower_bound,
-                      r_dimension_lower,
-                      ::common::errors::InvalidArgument(
-                          "GetConfigs function gets wrong r_dimension_lower"));
-    PADDLE_ENFORCE_EQ(it.first.rb_upper_bound,
-                      r_dimension_upper,
-                      ::common::errors::InvalidArgument(
-                          "GetConfigs function gets wrong r_dimension_upprt"));
     PADDLE_ENFORCE_EQ(it.second.spatial_inner_num,
                       tile_config.spatial_inner_num,
                       ::common::errors::InvalidArgument(
diff --git a/test/cpp/pir/cinn/pir_compiler_test.cc b/test/cpp/pir/cinn/pir_compiler_test.cc
index 8e2df8e02ac8c..622a4fec701f1 100644
--- a/test/cpp/pir/cinn/pir_compiler_test.cc
+++ b/test/cpp/pir/cinn/pir_compiler_test.cc
@@ -25,6 +25,7 @@
 #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
 #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h"
 #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h"
+#include "paddle/cinn/hlir/framework/pir/utils.h"
 #include "paddle/cinn/hlir/framework/pir_compiler.h"
 #include "paddle/cinn/utils/data_util.h"
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
@@ -38,6 +39,7 @@
 #include "paddle/pir/include/core/program.h"
 #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
 
+using cinn::hlir::framework::pir::CompatibleInfo;
 using cinn::hlir::framework::pir::OpLoweringGroup;
 using cinn::hlir::framework::pir::OpLoweringGroupPtr;
 
@@ -74,18 +76,26 @@ ProgramInfo BuildProgram() {
   builder.Build<pir::YieldOp>(std::vector<pir::Value>{relu_op_y.result(0)});
 
   std::vector<OpLoweringGroupPtr> groups;
+  const auto full_op_x_ops =
+      std::vector<::pir::Operation*>({full_op_x.operation()});
   groups.emplace_back(std::make_shared<OpLoweringGroup>(
-      std::initializer_list<::pir::Operation*>(
-          {full_op_x.operation()})));  // For coverage
+      full_op_x_ops,
+      CompatibleInfo::GroupOpsName(full_op_x_ops)));  // For coverage
   groups[0]->mut_output_values().push_back(groups[0]->ops().back()->result(0));
+
+  const auto full_op_y_ops =
+      std::vector<::pir::Operation*>({full_op_y.operation()});
   groups.emplace_back(std::make_shared<OpLoweringGroup>(
-      std::initializer_list<::pir::Operation*>({full_op_y.operation()})));
+      full_op_y_ops, CompatibleInfo::GroupOpsName(full_op_y_ops)));
+
   groups[1]->mut_output_values().push_back(groups[1]->ops().back()->result(0));
-  groups.emplace_back(std::make_shared<OpLoweringGroup>(
+  const auto vector_ops =
       std::vector<::pir::Operation*>({tan_op_x.operation(),
                                       relu_op_x.operation(),
                                       tan_op_y.operation(),
-                                      relu_op_y.operation()})));
+                                      relu_op_y.operation()});
+  groups.emplace_back(std::make_shared<OpLoweringGroup>(
+      vector_ops, CompatibleInfo::GroupOpsName(vector_ops)));
   groups[2]->mut_output_values().push_back(groups[2]->ops().back()->result(0));
 
   return {program, groups};
@@ -127,14 +137,16 @@ ProgramInfo BuildSoftmax() {
   auto yield_op = builder.Build<pir::YieldOp>(std::vector<pir::Value>{divide});
 
   std::vector<OpLoweringGroupPtr> groups;
-  groups.emplace_back(std::make_shared<OpLoweringGroup>(
+  const auto vector_ops =
       std::initializer_list<::pir::Operation*>({max.defining_op(),
                                                 broadcast_1.defining_op(),
                                                 sub.defining_op(),
                                                 exp.defining_op(),
                                                 sum.defining_op(),
                                                 broadcast_2.defining_op(),
-                                                divide.defining_op()})));
+                                                divide.defining_op()});
+  groups.emplace_back(std::make_shared<OpLoweringGroup>(
+      vector_ops, CompatibleInfo::GroupOpsName(vector_ops)));
   groups[0]->mut_output_values().push_back(groups[0]->ops().back()->result(0));
   groups[0]->set_op_pattern_kind(cinn::hlir::framework::kReduction);
 
diff --git a/test/cpp/pir/cinn/symbolic_lower_test.cc b/test/cpp/pir/cinn/symbolic_lower_test.cc
index 83de069dd622e..0c748d9b96da8 100644
--- a/test/cpp/pir/cinn/symbolic_lower_test.cc
+++ b/test/cpp/pir/cinn/symbolic_lower_test.cc
@@ -24,6 +24,7 @@
 #include "paddle/cinn/hlir/framework/pir/group.h"
 #include "paddle/cinn/hlir/framework/pir/op_lowering_group.h"
 #include "paddle/cinn/hlir/framework/pir/op_lowering_impl.h"
+#include "paddle/cinn/hlir/framework/pir/utils.h"
 #include "paddle/cinn/hlir/framework/pir_compiler.h"
 #include "paddle/common/ddim.h"
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
@@ -39,6 +40,7 @@
 
 PD_DECLARE_bool(cinn_bucket_compile);
 
+using cinn::hlir::framework::pir::CompatibleInfo;
 using cinn::hlir::framework::pir::OpLoweringGroup;
 using cinn::hlir::framework::pir::OpLoweringGroupPtr;
 
@@ -88,9 +90,11 @@ BuildGroupProgramForLowering() {
   builder.Build<paddle::dialect::FetchOp>(group_op->result(0), "out", 0);
 
   std::vector<OpLoweringGroupPtr> groups;
-  groups.emplace_back(
-      std::make_shared<OpLoweringGroup>(std::vector<::pir::Operation*>(
-          {exp.operation(), reshape.operation(), sub.operation()})));
+  groups.emplace_back(std::make_shared<OpLoweringGroup>(
+      std::vector<::pir::Operation*>(
+          {exp.operation(), reshape.operation(), sub.operation()}),
+      CompatibleInfo::GroupOpsName(std::vector<::pir::Operation*>(
+          {exp.operation(), reshape.operation(), sub.operation()}))));
   groups[0]->mut_output_ops().insert(groups[0]->ops().back());
   std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs>
       value_to_shape_data;
@@ -176,9 +180,11 @@ BuildBroadcastGroupProgramForLowering() {
   builder.Build<paddle::dialect::FetchOp>(group_op->result(0), "out", 0);
 
   std::vector<OpLoweringGroupPtr> groups;
-  groups.emplace_back(
-      std::make_shared<OpLoweringGroup>(std::vector<::pir::Operation*>(
-          {x_broadcast.operation(), sub.operation()})));
+  groups.emplace_back(std::make_shared<OpLoweringGroup>(
+      std::vector<::pir::Operation*>(
+          {x_broadcast.operation(), sub.operation()}),
+      CompatibleInfo::GroupOpsName(std::vector<::pir::Operation*>(
+          {x_broadcast.operation(), sub.operation()}))));
   groups[0]->mut_output_ops().insert(groups[0]->ops().back());
 
   std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs>
diff --git a/test/cpp/pir/cinn/tile_config_searcher_test.cc b/test/cpp/pir/cinn/tile_config_searcher_test.cc
index f54aa848b655a..289113a96bbab 100644
--- a/test/cpp/pir/cinn/tile_config_searcher_test.cc
+++ b/test/cpp/pir/cinn/tile_config_searcher_test.cc
@@ -66,22 +66,22 @@ TEST(ConfigSearcher, TestReduceDemo) {
   schedule_config_manager.SetPolicy("custom");
 
   // Step 3: Construct iter space and objective function.
-  cinn::ir::search::IterSpace iter_space;
-  iter_space.space.push_back(cinn::ir::search::IterSpace::Dimension{
-      33,
-      128,
-      "S",
-      /* is_dynamic = */ true,
-      std::vector<double>(128 - 32, 1.0)});
-  iter_space.space.push_back(
-      cinn::ir::search::IterSpace::Dimension{1024,
-                                             1024,
-                                             "R",
-                                             /* is_dynamic = */ false,
-                                             std::vector<double>(1, 1.0)});
+  cinn::ir::BucketInfo bucket_info;
+  bucket_info.space.push_back(
+      cinn::ir::BucketInfo::Dimension{33,
+                                      128,
+                                      "S",
+                                      /* is_dynamic = */ true,
+                                      std::vector<double>(128 - 32, 1.0)});
+  bucket_info.space.push_back(
+      cinn::ir::BucketInfo::Dimension{1024,
+                                      1024,
+                                      "R",
+                                      /* is_dynamic = */ false,
+                                      std::vector<double>(1, 1.0)});
   std::unique_ptr<cinn::ir::search::BaseObjectiveFunc> obj_func =
       std::make_unique<cinn::ir::search::WeightedSamplingTrailObjectiveFunc>(
-          program.get(), iter_space);
+          program.get(), bucket_info);
 
   // Step 4: Construct config candidate range and constraints.
   std::vector<std::pair<int, int>> candidate_range{
diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc
index 9ec1928ef10ff..a7674d60451cd 100644
--- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc
+++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc
@@ -426,7 +426,7 @@ TEST(pattern_rewrite, Patterns) {
   //     true));
 
   CHECK_EQ(pm.Run(&program), true);
-  EXPECT_EQ(program.block()->size(), 19u);
+  EXPECT_EQ(program.block()->size(), 17u);
 }
 
 void BuildConstantFoldingProgram(pir::Program *program,
diff --git a/test/cpp_extension/CMakeLists.txt b/test/cpp_extension/CMakeLists.txt
index 284695e9235a1..517ab10749baf 100644
--- a/test/cpp_extension/CMakeLists.txt
+++ b/test/cpp_extension/CMakeLists.txt
@@ -6,4 +6,6 @@ if(WITH_TESTING)
     set_tests_properties(test_cpp_extension_setup PROPERTIES TIMEOUT 120)
     set_tests_properties(test_cpp_extension_jit PROPERTIES TIMEOUT 120)
   endif()
+  py_test(test_mixed_extension_setup SRCS test_mixed_extension_setup.py)
+  set_tests_properties(test_mixed_extension_setup PROPERTIES TIMEOUT 120)
 endif()
diff --git a/test/deprecated/cpp_extension/mix_relu_and_extension.cc b/test/cpp_extension/mix_relu_and_extension.cc
similarity index 100%
rename from test/deprecated/cpp_extension/mix_relu_and_extension.cc
rename to test/cpp_extension/mix_relu_and_extension.cc
diff --git a/test/deprecated/cpp_extension/mix_relu_and_extension_setup.py b/test/cpp_extension/mix_relu_and_extension_setup.py
similarity index 95%
rename from test/deprecated/cpp_extension/mix_relu_and_extension_setup.py
rename to test/cpp_extension/mix_relu_and_extension_setup.py
index 1576b4f9d23f4..823d0183cfda8 100644
--- a/test/deprecated/cpp_extension/mix_relu_and_extension_setup.py
+++ b/test/cpp_extension/mix_relu_and_extension_setup.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 
 import os
-import sys
 
-sys.path.append("../../cpp_extension")
 from utils import paddle_includes
 
 from paddle.utils.cpp_extension import CppExtension, setup
diff --git a/test/deprecated/cpp_extension/test_mixed_extension_setup.py b/test/cpp_extension/test_mixed_extension_setup.py
similarity index 100%
rename from test/deprecated/cpp_extension/test_mixed_extension_setup.py
rename to test/cpp_extension/test_mixed_extension_setup.py
diff --git a/test/custom_op/CMakeLists.txt b/test/custom_op/CMakeLists.txt
index d59250643b883..950f870261eb8 100644
--- a/test/custom_op/CMakeLists.txt
+++ b/test/custom_op/CMakeLists.txt
@@ -11,6 +11,8 @@ if(WITH_TESTING)
     set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180)
     set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180)
     set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180)
+    py_test(test_custom_cast_op_jit SRCS test_custom_cast_op_jit.py)
+    set_tests_properties(test_custom_cast_op_jit PROPERTIES TIMEOUT 180)
   endif()
 
   if(NOT WIN32)
diff --git a/test/deprecated/custom_op/custom_cast_op.cc b/test/custom_op/custom_cast_op.cc
similarity index 100%
rename from test/deprecated/custom_op/custom_cast_op.cc
rename to test/custom_op/custom_cast_op.cc
diff --git a/test/deprecated/custom_op/test_custom_cast_op_jit.py b/test/custom_op/test_custom_cast_op_jit.py
similarity index 100%
rename from test/deprecated/custom_op/test_custom_cast_op_jit.py
rename to test/custom_op/test_custom_cast_op_jit.py
diff --git a/test/custom_op/test_custom_relu_op_setup.py b/test/custom_op/test_custom_relu_op_setup.py
index f5339d8dcce0a..c32abc0df1615 100644
--- a/test/custom_op/test_custom_relu_op_setup.py
+++ b/test/custom_op/test_custom_relu_op_setup.py
@@ -22,6 +22,7 @@
 
 import paddle
 from paddle import static
+from paddle.pir_utils import test_with_pir_api
 from paddle.utils.cpp_extension.extension_utils import run_cmd
 from paddle.vision.transforms import Compose, Normalize
 
@@ -230,6 +231,7 @@ def _test_dynamic(self):
                     check_output(out, pd_out, "out")
                     check_output(x_grad, pd_x_grad, "x_grad")
 
+    @test_with_pir_api
     def _test_static_save_and_load_inference_model(self):
         paddle.enable_static()
         np_data = np.random.random((1, 1, 28, 28)).astype("float32")
diff --git a/test/deprecated/CMakeLists.txt b/test/deprecated/CMakeLists.txt
index ffaf747a547d0..2254fc3d5f9dd 100644
--- a/test/deprecated/CMakeLists.txt
+++ b/test/deprecated/CMakeLists.txt
@@ -132,12 +132,9 @@ if(WITH_TESTING)
   if(WIN32 AND WIN_UNITTEST_LEVEL LESS 2)
     message(STATUS "Skip tests unrelated to CUDA/TRT")
   else()
-    add_subdirectory(amp)
     add_subdirectory(asp)
-    add_subdirectory(autograd)
     add_subdirectory(custom_op)
     add_subdirectory(custom_runtime)
-    add_subdirectory(cpp_extension)
     add_subdirectory(prim)
     add_subdirectory(standalone_executor)
     add_subdirectory(tokenizer)
@@ -155,11 +152,6 @@ if(WITH_TESTING)
 
   if(WITH_DISTRIBUTE)
     add_subdirectory(collective)
-    add_subdirectory(distributed_passes)
-  endif()
-
-  if(NOT WIN32 OR NOT WITH_GPU)
-    add_subdirectory(fft)
   endif()
 
 endif()
diff --git a/test/deprecated/amp/CMakeLists.txt b/test/deprecated/amp/CMakeLists.txt
deleted file mode 100755
index 60cf0f5fa43d2..0000000000000
--- a/test/deprecated/amp/CMakeLists.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-file(
-  GLOB TEST_OPS
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-function(py_test_modules TARGET_NAME)
-  if(WITH_TESTING)
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS)
-    cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}"
-                          "${multiValueArgs}" ${ARGN})
-
-    if(WITH_COVERAGE AND NOT (WITH_INCREMENTAL_COVERAGE
-                              AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
-
-    if(py_test_modules_SERIAL)
-      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
-    if(WIN32)
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
-    endif()
-  endif()
-endfunction()
-
-foreach(TEST_OP ${TEST_OPS})
-  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-endforeach()
diff --git a/test/deprecated/asp/CMakeLists.txt b/test/deprecated/asp/CMakeLists.txt
index 24b7364d5ba68..886d7908d0e0a 100644
--- a/test/deprecated/asp/CMakeLists.txt
+++ b/test/deprecated/asp/CMakeLists.txt
@@ -9,6 +9,6 @@ foreach(TEST_OP ${TEST_OPS})
 endforeach()
 
 set_tests_properties(test_asp_pruning_dynamic PROPERTIES TIMEOUT 30)
-set_tests_properties(test_asp_pruning_static PROPERTIES TIMEOUT 30)
+set_tests_properties(test_asp_pruning_static_deprecated PROPERTIES TIMEOUT 30)
 set_tests_properties(test_asp_optimize_dynamic PROPERTIES TIMEOUT 30)
-set_tests_properties(test_asp_optimize_static PROPERTIES TIMEOUT 30)
+set_tests_properties(test_asp_optimize_static_deprecated PROPERTIES TIMEOUT 30)
diff --git a/test/deprecated/asp/test_asp_customized_pruning.py b/test/deprecated/asp/test_asp_customized_pruning.py
index f17acd61de42a..44bb65682fef0 100644
--- a/test/deprecated/asp/test_asp_customized_pruning.py
+++ b/test/deprecated/asp/test_asp_customized_pruning.py
@@ -18,8 +18,6 @@
 import numpy as np
 
 import paddle
-from paddle import base
-from paddle.base import core
 from paddle.incubate import asp as sparsity
 from paddle.incubate.asp.supported_layer_list import (
     supported_layers_and_prune_func_map,
@@ -184,160 +182,5 @@ def test_training_pruning(self):
         self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
 
 
-class TestASPStaticCustomizedPruneFunc(unittest.TestCase):
-    def setUp(self):
-        paddle.enable_static()
-
-        self.main_program = base.Program()
-        self.startup_program = base.Program()
-
-        self.customer_prefix = "customer_layer"
-
-        def build_model():
-            img = paddle.static.data(
-                name='img', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            label = paddle.static.data(
-                name='label', shape=[None, 1], dtype='int64'
-            )
-            hidden = paddle.static.nn.conv2d(
-                input=img, num_filters=4, filter_size=3, padding=2, act="relu"
-            )
-            hidden = paddle.static.nn.fc(
-                x=hidden, size=32, activation='relu', name=self.customer_prefix
-            )
-            hidden = paddle.static.nn.fc(
-                x=hidden, size=32, activation='relu', name=self.customer_prefix
-            )
-            hidden = paddle.static.nn.fc(x=hidden, size=32, activation='relu')
-            prediction = paddle.static.nn.fc(
-                x=hidden, size=10, activation='softmax'
-            )
-            return img, label, prediction
-
-        with base.program_guard(self.main_program, self.startup_program):
-            self.img, self.label, self.predict = build_model()
-            self.supported_layer_count_ref = 5
-
-        self.place = paddle.CPUPlace()
-        if core.is_compiled_with_cuda():
-            self.place = paddle.CUDAPlace(0)
-        self.exe = base.Executor(self.place)
-
-        sparsity.add_supported_layer(self.customer_prefix, my_own_pruning)
-
-    def test_inference_pruning(self):
-        self.exe.run(self.startup_program)
-
-        sparsity.prune_model(
-            self.main_program, mask_algo="mask_1d", with_mask=False
-        )
-
-        supported_layer_count = 0
-        for param in self.main_program.global_block().all_parameters():
-            mat = np.array(
-                base.global_scope().find_var(param.name).get_tensor()
-            )
-            if sparsity.asp.ASPHelper._is_supported_layer(
-                self.main_program, param.name
-            ):
-                supported_layer_count += 1
-                if self.customer_prefix in param.name:
-                    self.assertLessEqual(
-                        np.sum(mat.flatten() - static_tensor.flatten()), 1e-4
-                    )
-                else:
-                    if (len(param.shape) == 4 and param.shape[1] < 4) or (
-                        len(param.shape) == 2 and param.shape[0] < 4
-                    ):
-                        self.assertFalse(
-                            paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
-                        )
-                    else:
-                        self.assertTrue(
-                            sparsity.check_sparsity(
-                                mat.T,
-                                func_name=sparsity.CheckMethod.CHECK_1D,
-                                n=2,
-                                m=4,
-                            )
-                        )
-        self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
-
-    def test_training_pruning(self):
-        with base.program_guard(self.main_program, self.startup_program):
-            loss = paddle.mean(
-                paddle.nn.functional.cross_entropy(
-                    input=self.predict,
-                    label=self.label,
-                    reduction='none',
-                    use_softmax=False,
-                )
-            )
-            optimizer = sparsity.decorate(
-                paddle.optimizer.SGD(learning_rate=0.01)
-            )
-            optimizer.minimize(loss, self.startup_program)
-
-        self.exe.run(self.startup_program)
-
-        sparsity.prune_model(
-            self.main_program, mask_algo="mask_1d", with_mask=True
-        )
-
-        supported_layer_count = 0
-        for param in self.main_program.global_block().all_parameters():
-            mat = np.array(
-                base.global_scope().find_var(param.name).get_tensor()
-            )
-            if sparsity.asp.ASPHelper._is_supported_layer(
-                self.main_program, param.name
-            ):
-                mat_mask = np.array(
-                    base.global_scope()
-                    .find_var(sparsity.asp.ASPHelper._get_mask_name(param.name))
-                    .get_tensor()
-                )
-                supported_layer_count += 1
-                if self.customer_prefix in param.name:
-                    self.assertLessEqual(
-                        np.sum(mat.flatten() - static_tensor.flatten()), 1e-4
-                    )
-                    self.assertLessEqual(
-                        np.sum(
-                            mat_mask.flatten() - static_tensor_mask.flatten()
-                        ),
-                        1e-4,
-                    )
-                else:
-                    if (len(param.shape) == 4 and param.shape[1] < 4) or (
-                        len(param.shape) == 2 and param.shape[0] < 4
-                    ):
-                        self.assertFalse(
-                            sparsity.check_sparsity(mat.T, n=2, m=4)
-                        )
-                        self.assertFalse(
-                            sparsity.check_sparsity(mat_mask.T, n=2, m=4)
-                        )
-                    else:
-                        self.assertTrue(
-                            sparsity.check_sparsity(
-                                mat.T,
-                                func_name=sparsity.CheckMethod.CHECK_1D,
-                                n=2,
-                                m=4,
-                            )
-                        )
-                        self.assertTrue(
-                            sparsity.check_sparsity(
-                                mat_mask.T,
-                                func_name=sparsity.CheckMethod.CHECK_1D,
-                                n=2,
-                                m=4,
-                            )
-                        )
-        self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/deprecated/asp/test_asp_customized_pruning_deprecated.py b/test/deprecated/asp/test_asp_customized_pruning_deprecated.py
new file mode 100644
index 0000000000000..c088c1c827f5c
--- /dev/null
+++ b/test/deprecated/asp/test_asp_customized_pruning_deprecated.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+from paddle.incubate import asp as sparsity
+from paddle.nn.layer.layers import Layer
+
+
+class MyOwnLayer(Layer):  # NOTE(review): not referenced in this file
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x  # pass-through: the input is returned unchanged
+
+
+static_tensor = None  # lazily filled by my_own_pruning on its first call
+static_tensor_mask = None  # lazily filled by my_own_pruning on its first call
+
+
+def my_own_pruning(tensor, m, n, mask_algo, param_name):  # ASP prune callback
+    global static_tensor
+    global static_tensor_mask
+    if static_tensor is None:  # created once, then reused on later calls
+        static_tensor = np.random.rand(*tensor.shape).astype(np.float32)
+    if static_tensor_mask is None:  # same one-time caching for the mask
+        static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32)
+    return static_tensor, static_tensor_mask  # other args are ignored
+
+
+class TestASPStaticCustomizedPruneFunc(unittest.TestCase):
+    def setUp(self):  # build model, executor; register custom prune func
+        paddle.enable_static()  # everything below builds static Programs
+
+        self.main_program = base.Program()
+        self.startup_program = base.Program()
+
+        self.customer_prefix = "customer_layer"  # name given to two fc layers
+
+        def build_model():  # data -> conv2d -> fc x3 -> softmax fc
+            img = paddle.static.data(
+                name='img', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            label = paddle.static.data(
+                name='label', shape=[None, 1], dtype='int64'
+            )
+            hidden = paddle.static.nn.conv2d(
+                input=img, num_filters=4, filter_size=3, padding=2, act="relu"
+            )
+            hidden = paddle.static.nn.fc(
+                x=hidden, size=32, activation='relu', name=self.customer_prefix
+            )
+            hidden = paddle.static.nn.fc(
+                x=hidden, size=32, activation='relu', name=self.customer_prefix
+            )
+            hidden = paddle.static.nn.fc(x=hidden, size=32, activation='relu')
+            prediction = paddle.static.nn.fc(
+                x=hidden, size=10, activation='softmax'
+            )
+            return img, label, prediction
+
+        with base.program_guard(self.main_program, self.startup_program):
+            self.img, self.label, self.predict = build_model()
+            self.supported_layer_count_ref = 5  # asserted by both tests
+
+        self.place = paddle.CPUPlace()  # CUDAPlace(0) on CUDA builds
+        if core.is_compiled_with_cuda():
+            self.place = paddle.CUDAPlace(0)
+        self.exe = base.Executor(self.place)
+
+        sparsity.add_supported_layer(self.customer_prefix, my_own_pruning)
+
+    def test_inference_pruning(self):  # prune_model(..., with_mask=False)
+        self.exe.run(self.startup_program)  # run startup before pruning
+
+        sparsity.prune_model(
+            self.main_program, mask_algo="mask_1d", with_mask=False
+        )
+
+        supported_layer_count = 0  # params accepted by _is_supported_layer
+        for param in self.main_program.global_block().all_parameters():
+            mat = np.array(
+                base.global_scope().find_var(param.name).get_tensor()
+            )
+            if sparsity.asp.ASPHelper._is_supported_layer(
+                self.main_program, param.name
+            ):
+                supported_layer_count += 1
+                if self.customer_prefix in param.name:  # registered prefix
+                    self.assertLessEqual(  # NOTE(review): signed sum, not abs
+                        np.sum(mat.flatten() - static_tensor.flatten()), 1e-4
+                    )
+                else:
+                    if (len(param.shape) == 4 and param.shape[1] < 4) or (
+                        len(param.shape) == 2 and param.shape[0] < 4
+                    ):
+                        self.assertFalse(  # dim < 4: expect not sparse
+                            paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
+                        )
+                    else:
+                        self.assertTrue(  # otherwise expect CHECK_1D to pass
+                            sparsity.check_sparsity(
+                                mat.T,
+                                func_name=sparsity.CheckMethod.CHECK_1D,
+                                n=2,
+                                m=4,
+                            )
+                        )
+        self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
+
+    def test_training_pruning(self):  # prune_model(..., with_mask=True)
+        with base.program_guard(self.main_program, self.startup_program):
+            loss = paddle.mean(
+                paddle.nn.functional.cross_entropy(
+                    input=self.predict,
+                    label=self.label,
+                    reduction='none',
+                    use_softmax=False,
+                )
+            )
+            optimizer = sparsity.decorate(  # wrap SGD with ASP
+                paddle.optimizer.SGD(learning_rate=0.01)
+            )
+            optimizer.minimize(loss, self.startup_program)
+
+        self.exe.run(self.startup_program)  # run startup before pruning
+
+        sparsity.prune_model(
+            self.main_program, mask_algo="mask_1d", with_mask=True
+        )
+
+        supported_layer_count = 0  # params accepted by _is_supported_layer
+        for param in self.main_program.global_block().all_parameters():
+            mat = np.array(
+                base.global_scope().find_var(param.name).get_tensor()
+            )
+            if sparsity.asp.ASPHelper._is_supported_layer(
+                self.main_program, param.name
+            ):
+                mat_mask = np.array(  # mask variable paired with this param
+                    base.global_scope()
+                    .find_var(sparsity.asp.ASPHelper._get_mask_name(param.name))
+                    .get_tensor()
+                )
+                supported_layer_count += 1
+                if self.customer_prefix in param.name:  # registered prefix
+                    self.assertLessEqual(  # NOTE(review): signed sum, not abs
+                        np.sum(mat.flatten() - static_tensor.flatten()), 1e-4
+                    )
+                    self.assertLessEqual(  # NOTE(review): signed sum, not abs
+                        np.sum(
+                            mat_mask.flatten() - static_tensor_mask.flatten()
+                        ),
+                        1e-4,
+                    )
+                else:
+                    if (len(param.shape) == 4 and param.shape[1] < 4) or (
+                        len(param.shape) == 2 and param.shape[0] < 4
+                    ):
+                        self.assertFalse(  # dim < 4: weight not sparse
+                            sparsity.check_sparsity(mat.T, n=2, m=4)
+                        )
+                        self.assertFalse(  # dim < 4: mask not sparse either
+                            sparsity.check_sparsity(mat_mask.T, n=2, m=4)
+                        )
+                    else:
+                        self.assertTrue(  # weight expected to pass CHECK_1D
+                            sparsity.check_sparsity(
+                                mat.T,
+                                func_name=sparsity.CheckMethod.CHECK_1D,
+                                n=2,
+                                m=4,
+                            )
+                        )
+                        self.assertTrue(  # mask expected to pass CHECK_1D
+                            sparsity.check_sparsity(
+                                mat_mask.T,
+                                func_name=sparsity.CheckMethod.CHECK_1D,
+                                n=2,
+                                m=4,
+                            )
+                        )
+        self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/asp/test_asp_optimize_static.py b/test/deprecated/asp/test_asp_optimize_static_deprecated.py
similarity index 100%
rename from test/deprecated/asp/test_asp_optimize_static.py
rename to test/deprecated/asp/test_asp_optimize_static_deprecated.py
diff --git a/test/deprecated/asp/test_asp_pruning_static.py b/test/deprecated/asp/test_asp_pruning_static_deprecated.py
similarity index 100%
rename from test/deprecated/asp/test_asp_pruning_static.py
rename to test/deprecated/asp/test_asp_pruning_static_deprecated.py
diff --git a/test/deprecated/asp/test_asp_save_load.py b/test/deprecated/asp/test_asp_save_load.py
index f9966c321b37e..83379f2f5ae3c 100644
--- a/test/deprecated/asp/test_asp_save_load.py
+++ b/test/deprecated/asp/test_asp_save_load.py
@@ -18,7 +18,6 @@
 import numpy as np
 
 import paddle
-from paddle import base
 from paddle.base import core
 from paddle.incubate.asp import ASPHelper
 
@@ -120,90 +119,5 @@ def test_save_and_load(self):
                     )
 
 
-class TestASPStaticOptimize(unittest.TestCase):
-    def setUp(self):
-        paddle.enable_static()
-
-        self.main_program = base.Program()
-        self.startup_program = base.Program()
-
-        def build_model():
-            img = paddle.static.data(
-                name='img', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            label = paddle.static.data(
-                name='label', shape=[None, 1], dtype='int64'
-            )
-            hidden = paddle.static.nn.conv2d(
-                input=img, num_filters=4, filter_size=3, padding=2, act="relu"
-            )
-            hidden = paddle.static.nn.fc(x=hidden, size=32, activation='relu')
-            prediction = paddle.static.nn.fc(
-                x=hidden, size=10, activation='softmax'
-            )
-            return img, label, prediction
-
-        with base.program_guard(self.main_program, self.startup_program):
-            self.img, self.label, predict = build_model()
-            self.loss = paddle.mean(
-                paddle.nn.functional.cross_entropy(
-                    input=predict,
-                    label=self.label,
-                    reduction='none',
-                    use_softmax=False,
-                )
-            )
-            self.optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-            self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
-            self.optimizer.minimize(self.loss, self.startup_program)
-
-        self.place = paddle.CPUPlace()
-        if core.is_compiled_with_cuda():
-            self.place = paddle.CUDAPlace(0)
-        self.exe = base.Executor(self.place)
-        self.exe.run(self.startup_program)
-
-        paddle.incubate.asp.prune_model(self.main_program)
-
-    def test_save_and_load(self):
-        path = "/tmp/paddle_asp_save_st/"
-        param_path = path + "asp.pdparams"
-        model_path = path + "asp.pdmodel"
-
-        paddle.save(self.main_program.state_dict(), param_path)
-        paddle.save(self.main_program, model_path)
-
-        prog = paddle.load(model_path)
-
-        state_dict = paddle.load(param_path)
-        prog.set_state_dict(state_dict)
-
-        feeder = base.DataFeeder(
-            feed_list=[self.img, self.label], place=self.place
-        )
-
-        data = (
-            np.random.randn(64, 3, 32, 32),
-            np.random.randint(10, size=(64, 1)),
-        )
-        self.exe.run(prog, feed=feeder.feed([data]))
-
-        for param in prog.global_block().all_parameters():
-            if ASPHelper._is_supported_layer(prog, param.name):
-                mat = np.array(
-                    base.global_scope().find_var(param.name).get_tensor()
-                )
-                if (len(param.shape) == 4 and param.shape[1] < 4) or (
-                    len(param.shape) == 2 and param.shape[0] < 4
-                ):
-                    self.assertFalse(
-                        paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
-                    )
-                else:
-                    self.assertTrue(
-                        paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
-                    )
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/deprecated/asp/test_asp_save_load_deprecated.py b/test/deprecated/asp/test_asp_save_load_deprecated.py
new file mode 100644
index 0000000000000..28386b1d2df54
--- /dev/null
+++ b/test/deprecated/asp/test_asp_save_load_deprecated.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+from paddle.incubate.asp import ASPHelper
+
+
+class MyLayer(paddle.nn.Layer):  # NOTE(review): not referenced in this file
+    def __init__(self):
+        super().__init__()
+        self.conv1 = paddle.nn.Conv2D(
+            in_channels=3, out_channels=4, kernel_size=3, padding=2
+        )
+        self.linear1 = paddle.nn.Linear(4624, 32)  # 4624 = 4 * 34 * 34
+        self.linear2 = paddle.nn.Linear(32, 32)
+        self.linear3 = paddle.nn.Linear(32, 10)
+
+    def forward(self, img):  # conv1 -> flatten -> linear1..3, no activations
+        hidden = self.conv1(img)
+        hidden = paddle.flatten(hidden, start_axis=1)  # flatten all but axis 0
+        hidden = self.linear1(hidden)
+        hidden = self.linear2(hidden)
+        prediction = self.linear3(hidden)
+        return prediction
+
+
+class TestASPStaticOptimize(unittest.TestCase):  # exercises save/load
+    def setUp(self):  # build, decorate optimizer, run startup, then prune
+        paddle.enable_static()  # everything below builds static Programs
+
+        self.main_program = base.Program()
+        self.startup_program = base.Program()
+
+        def build_model():  # data -> conv2d -> fc -> softmax fc
+            img = paddle.static.data(
+                name='img', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            label = paddle.static.data(
+                name='label', shape=[None, 1], dtype='int64'
+            )
+            hidden = paddle.static.nn.conv2d(
+                input=img, num_filters=4, filter_size=3, padding=2, act="relu"
+            )
+            hidden = paddle.static.nn.fc(x=hidden, size=32, activation='relu')
+            prediction = paddle.static.nn.fc(
+                x=hidden, size=10, activation='softmax'
+            )
+            return img, label, prediction
+
+        with base.program_guard(self.main_program, self.startup_program):
+            self.img, self.label, predict = build_model()
+            self.loss = paddle.mean(
+                paddle.nn.functional.cross_entropy(
+                    input=predict,
+                    label=self.label,
+                    reduction='none',
+                    use_softmax=False,
+                )
+            )
+            self.optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
+            self.optimizer.minimize(self.loss, self.startup_program)
+
+        self.place = paddle.CPUPlace()  # CUDAPlace(0) on CUDA builds
+        if core.is_compiled_with_cuda():
+            self.place = paddle.CUDAPlace(0)
+        self.exe = base.Executor(self.place)
+        self.exe.run(self.startup_program)  # run startup before pruning
+
+        paddle.incubate.asp.prune_model(self.main_program)  # prune first
+
+    def test_save_and_load(self):  # save, reload, run once, check sparsity
+        path = "/tmp/paddle_asp_save_st/"  # NOTE(review): hard-coded /tmp
+        param_path = path + "asp.pdparams"
+        model_path = path + "asp.pdmodel"
+
+        paddle.save(self.main_program.state_dict(), param_path)
+        paddle.save(self.main_program, model_path)
+
+        prog = paddle.load(model_path)  # reload the saved Program
+
+        state_dict = paddle.load(param_path)
+        prog.set_state_dict(state_dict)  # restore the saved parameters
+
+        feeder = base.DataFeeder(
+            feed_list=[self.img, self.label], place=self.place
+        )
+
+        data = (
+            np.random.randn(64, 3, 32, 32),  # fed to img
+            np.random.randint(10, size=(64, 1)),  # fed to label
+        )
+        self.exe.run(prog, feed=feeder.feed([data]))  # run reloaded prog once
+
+        for param in prog.global_block().all_parameters():
+            if ASPHelper._is_supported_layer(prog, param.name):
+                mat = np.array(
+                    base.global_scope().find_var(param.name).get_tensor()
+                )
+                if (len(param.shape) == 4 and param.shape[1] < 4) or (
+                    len(param.shape) == 2 and param.shape[0] < 4
+                ):
+                    self.assertFalse(  # dim < 4: expect not sparse
+                        paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
+                    )
+                else:
+                    self.assertTrue(  # otherwise expect n=2, m=4 sparsity
+                        paddle.incubate.asp.check_sparsity(mat.T, n=2, m=4)
+                    )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/autograd/CMakeLists.txt b/test/deprecated/autograd/CMakeLists.txt
deleted file mode 100644
index 35e12e591aea8..0000000000000
--- a/test/deprecated/autograd/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-file(
-  GLOB TEST_OPS
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0)
-
-foreach(TEST_OP ${TEST_OPS})
-  py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS})
-endforeach()
-
-set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160)
diff --git a/test/deprecated/autograd/config.py b/test/deprecated/autograd/config.py
deleted file mode 100644
index ff2d64a43bbc9..0000000000000
--- a/test/deprecated/autograd/config.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle
-
-DEVICES = [paddle.CPUPlace()]
-if paddle.is_compiled_with_cuda():
-    DEVICES.append(paddle.CUDAPlace(0))
-
-DEFAULT_DTYPE = 'float64'
-
-# The numerical tolerance of different dtype of different order different
-# derivative. It's a empirical value provided by Paddle Science team.
-TOLERANCE = {
-    "float32": {
-        "first_order_grad": {"rtol": 1e-3, "atol": 1e-3, "eps": 1e-4},
-        "second_order_grad": {"rtol": 1e-2, "atol": 1e-2, "eps": 1e-2},
-    },
-    "float64": {
-        "first_order_grad": {"rtol": 1e-7, "atol": 1e-7, "eps": 1e-7},
-        "second_order_grad": {"rtol": 1e-5, "atol": 1e-5, "eps": 1e-5},
-    },
-}
diff --git a/test/deprecated/autograd/utils.py b/test/deprecated/autograd/utils.py
deleted file mode 100644
index 64a16897d9b25..0000000000000
--- a/test/deprecated/autograd/utils.py
+++ /dev/null
@@ -1,454 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import enum
-import sys
-import typing
-
-import numpy as np
-
-import paddle
-from paddle.incubate.autograd.utils import as_tensors
-
-
-##########################################################
-# Finite Difference Utils
-##########################################################
-def _product(t):
-    return int(np.prod(t))
-
-
-def _get_item(t, idx):
-    assert isinstance(
-        t, paddle.base.framework.Variable
-    ), "The first argument t must be Tensor."
-    assert isinstance(
-        idx, int
-    ), "The second argument idx must be an int number."
-    flat_t = paddle.reshape(t, [-1])
-    return flat_t.__getitem__(idx)
-
-
-def _set_item(t, idx, value):
-    assert isinstance(
-        t, paddle.base.framework.Variable
-    ), "The first argument t must be Tensor."
-    assert isinstance(
-        idx, int
-    ), "The second argument idx must be an int number."
-    flat_t = paddle.reshape(t, [-1])
-    flat_t.__setitem__(idx, value)
-    return paddle.reshape(flat_t, t.shape)
-
-
-def _compute_numerical_jacobian(func, xs, delta, np_dtype):
-    xs = list(as_tensors(xs))
-    ys = list(as_tensors(func(*xs)))
-    fin_size = len(xs)
-    fout_size = len(ys)
-    jacobian = [[] for _ in range(fout_size)]
-    for i in range(fout_size):
-        jac_i = [[] for _ in range(fin_size)]
-        for j in range(fin_size):
-            jac_i[j] = np.zeros(
-                (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype
-            )
-        jacobian[i] = jac_i
-
-    for j in range(fin_size):
-        for q in range(_product(xs[j].shape)):
-            orig = _get_item(xs[j], q)
-            orig = paddle.assign(orig)
-            x_pos = orig + delta
-            xs[j] = _set_item(xs[j], q, x_pos)
-            ys_pos = as_tensors(func(*xs))
-
-            x_neg = orig - delta
-            xs[j] = _set_item(xs[j], q, x_neg)
-            ys_neg = as_tensors(func(*xs))
-
-            xs[j] = _set_item(xs[j], q, orig)
-
-            for i in range(fout_size):
-                for p in range(_product(ys[i].shape)):
-                    y_pos = _get_item(ys_pos[i], p)
-                    y_neg = _get_item(ys_neg[i], p)
-                    jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2.0
-    return jacobian
-
-
-def _compute_numerical_hessian(func, xs, delta, np_dtype):
-    xs = list(as_tensors(xs))
-    ys = list(as_tensors(func(*xs)))
-    fin_size = len(xs)
-    hessian = [[] for _ in range(fin_size)]
-    for i in range(fin_size):
-        hessian_i = [[] for _ in range(fin_size)]
-        for j in range(fin_size):
-            hessian_i[j] = np.zeros(
-                (_product(xs[i].shape), _product(xs[j].shape)), dtype=np_dtype
-            )
-        hessian[i] = hessian_i
-
-    for i in range(fin_size):
-        for p in range(_product(xs[i].shape)):
-            for j in range(fin_size):
-                for q in range(_product(xs[j].shape)):
-                    orig = _get_item(xs[j], q)
-                    orig = paddle.assign(orig)
-                    x_pos = orig + delta
-                    xs[j] = _set_item(xs[j], q, x_pos)
-                    jacobian_pos = _compute_numerical_jacobian(
-                        func, xs, delta, np_dtype
-                    )
-                    x_neg = orig - delta
-                    xs[j] = _set_item(xs[j], q, x_neg)
-                    jacobian_neg = _compute_numerical_jacobian(
-                        func, xs, delta, np_dtype
-                    )
-                    xs[j] = _set_item(xs[j], q, orig)
-                    hessian[i][j][p][q] = (
-                        (jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p])
-                        / delta
-                        / 2.0
-                    )
-    return hessian
-
-
-def concat_to_matrix(xs, is_batched=False):
-    """Concats a tuple of tuple of Jacobian/Hessian matrix into one matrix"""
-    rows = []
-    for i in range(len(xs)):
-        rows.append(np.concatenate(list(xs[i]), -1))
-    return np.concatenate(rows, 1) if is_batched else np.concatenate(rows, 0)
-
-
-def _compute_numerical_batch_jacobian(
-    func, xs, delta, np_dtype, merge_batch=True
-):
-    no_batch_jacobian = _compute_numerical_jacobian(func, xs, delta, np_dtype)
-    xs = list(as_tensors(xs))
-    ys = list(as_tensors(func(*xs)))
-    fin_size = len(xs)
-    fout_size = len(ys)
-    bs = xs[0].shape[0]
-    bat_jac = []
-    for i in range(fout_size):
-        batch_jac_i = []
-        for j in range(fin_size):
-            jac = no_batch_jacobian[i][j]
-            jac_shape = jac.shape
-            out_size = jac_shape[0] // bs
-            in_size = jac_shape[1] // bs
-            jac = np.reshape(jac, (bs, out_size, bs, in_size))
-            batch_jac_i_j = np.zeros(shape=(out_size, bs, in_size))
-            for p in range(out_size):
-                for b in range(bs):
-                    for q in range(in_size):
-                        batch_jac_i_j[p][b][q] = jac[b][p][b][q]
-            if merge_batch:
-                batch_jac_i_j = np.reshape(batch_jac_i_j, (out_size, -1))
-            batch_jac_i.append(batch_jac_i_j)
-        bat_jac.append(batch_jac_i)
-
-    return bat_jac
-
-
-def _compute_numerical_batch_hessian(func, xs, delta, np_dtype):
-    xs = list(as_tensors(xs))
-    batch_size = xs[0].shape[0]
-    fin_size = len(xs)
-    hessian = []
-    for b in range(batch_size):
-        x_l = []
-        for j in range(fin_size):
-            x_l.append(paddle.reshape(xs[j][b], shape=[1, -1]))
-        hes_b = _compute_numerical_hessian(func, x_l, delta, np_dtype)
-        if fin_size == 1:
-            hessian.append(hes_b[0][0])
-        else:
-            hessian.append(hes_b)
-
-    hessian_res = []
-    for index in range(fin_size):
-        x_reshape = paddle.reshape(xs[index], shape=[batch_size, -1])
-        for index_ in range(fin_size):
-            for i in range(x_reshape.shape[1]):
-                tmp = []
-                for j in range(batch_size):
-                    if fin_size == 1:
-                        tmp.extend(hessian[j][i])
-                    else:
-                        tmp.extend(hessian[j][i][index_][index])
-                hessian_res.append(tmp)
-        if fin_size == 1:
-            return hessian_res
-
-    hessian_result = []
-    mid = len(hessian_res) // 2
-    for i in range(mid):
-        hessian_result.append(
-            np.stack((hessian_res[i], hessian_res[mid + i]), axis=0)
-        )
-    return hessian_result
-
-
-def _compute_numerical_vjp(func, xs, v, delta, np_dtype):
-    xs = as_tensors(xs)
-    jacobian = np.array(_compute_numerical_jacobian(func, xs, delta, np_dtype))
-    if v is None:
-        v = [paddle.ones_like(x) for x in xs]
-    flat_v = np.array([v_el.numpy().reshape(-1) for v_el in v])
-    vjp = [np.zeros((_product(x.shape)), dtype=np_dtype) for x in xs]
-    for j in range(len(xs)):
-        for q in range(_product(xs[j].shape)):
-            vjp[j][q] = np.sum(
-                jacobian[:, j, :, q].reshape(flat_v.shape) * flat_v
-            )
-    vjp = [vjp[j].reshape(xs[j].shape) for j in range(len(xs))]
-    return vjp
-
-
-def _compute_numerical_vhp(func, xs, v, delta, np_dtype):
-    xs = list(as_tensors(xs))
-    hessian = np.array(_compute_numerical_hessian(func, xs, delta, np_dtype))
-    flat_v = np.array([v_el.numpy().reshape(-1) for v_el in v])
-    vhp = [np.zeros((_product(x.shape)), dtype=np_dtype) for x in xs]
-    for j in range(len(xs)):
-        for q in range(_product(xs[j].shape)):
-            vhp[j][q] = np.sum(
-                hessian[:, j, :, q].reshape(flat_v.shape) * flat_v
-            )
-    vhp = [vhp[j].reshape(xs[j].shape) for j in range(len(xs))]
-    return vhp
-
-
-##########################################################
-# TestCases of different function.
-##########################################################
-def reduce(x):
-    return paddle.sum(x)
-
-
-def reduce_dim(x):
-    return paddle.sum(x, axis=0)
-
-
-def matmul(x, y):
-    return paddle.matmul(x, y)
-
-
-def mul(x, y):
-    return x * y
-
-
-def pow(x, y):
-    return paddle.pow(x, y)
-
-
-def o2(x, y):
-    return paddle.multiply(x, y), paddle.matmul(x, y.t())
-
-
-def unuse(x, y):
-    return paddle.sum(x)
-
-
-def nested(x):
-    def inner(y):
-        return x * y
-
-    return inner
-
-
-def square(x):
-    return x * x
-
-
-##########################################################
-# Parameterized Test Utils.
-##########################################################
-
-TEST_CASE_NAME = 'suffix'
-
-
-def place(devices, key='place'):
-    """A Decorator for a class which will make the class running on different
-    devices .
-
-    Args:
-        devices (Sequence[Paddle.CUDAPlace|Paddle.CPUPlace]): Device list.
-        key (str, optional): Defaults to 'place'.
-    """
-
-    def decorate(cls):
-        module = sys.modules[cls.__module__].__dict__
-        raw_classes = {
-            k: v for k, v in module.items() if k.startswith(cls.__name__)
-        }
-
-        for raw_name, raw_cls in raw_classes.items():
-            for d in devices:
-                test_cls = dict(raw_cls.__dict__)
-                test_cls.update({key: d})
-                new_name = raw_name + '.' + d.__class__.__name__
-                module[new_name] = type(new_name, (raw_cls,), test_cls)
-            del module[raw_name]
-        return cls
-
-    return decorate
-
-
-def parameterize(fields, values=None):
-    """Decorator for a unittest class which make the class running on different
-    test cases.
-
-    Args:
-        fields (Sequence): The field name sequence of test cases.
-        values (Sequence, optional): The test cases sequence. Defaults to None.
-
-    """
-    fields = [fields] if isinstance(fields, str) else fields
-    params = [dict(zip(fields, vals)) for vals in values]
-
-    def decorate(cls):
-        test_cls_module = sys.modules[cls.__module__].__dict__
-        for i, values in enumerate(params):
-            test_cls = dict(cls.__dict__)
-            values = {
-                k: staticmethod(v) if callable(v) else v
-                for k, v in values.items()
-            }
-            test_cls.update(values)
-            name = cls.__name__ + str(i)
-            name = (
-                name + '.' + values.get('suffix')
-                if values.get('suffix')
-                else name
-            )
-
-            test_cls_module[name] = type(name, (cls,), test_cls)
-
-        for m in list(cls.__dict__):
-            if m.startswith("test"):
-                delattr(cls, m)
-        return cls
-
-    return decorate
-
-
-##########################################################
-# Utils for transpose different Jacobian/Hessian matrix format.
-##########################################################
-
-# B is batch size, N is row size, M is column size.
-MatrixFormat = enum.Enum('MatrixFormat', ('NBM', 'BNM', 'NMB', 'NM'))
-
-
-def _np_transpose_matrix_format(src, src_format, des_format):
-    """Transpose Jacobian/Hessian matrix format."""
-    supported_format = (MatrixFormat.NBM, MatrixFormat.BNM, MatrixFormat.NMB)
-    if src_format not in supported_format or des_format not in supported_format:
-        raise ValueError(
-            f"Supported Jacobian format is {supported_format}, but got src: {src_format}, des: {des_format}"
-        )
-
-    src_axis = {c: i for i, c in enumerate(src_format.name)}
-    dst_axis = tuple(src_axis[c] for c in des_format.name)
-
-    return np.transpose(src, dst_axis)
-
-
-def _np_concat_matrix_sequence(src, src_format=MatrixFormat.NM):
-    """Convert a sequence of sequence of Jacobian/Hessian matrix into one huge
-    matrix."""
-
-    def concat_col(xs):
-        if src_format in (MatrixFormat.NBM, MatrixFormat.BNM, MatrixFormat.NM):
-            return np.concatenate(xs, axis=-1)
-        else:
-            return np.concatenate(xs, axis=1)
-
-    def concat_row(xs):
-        if src_format in (MatrixFormat.NBM, MatrixFormat.NM, MatrixFormat.NMB):
-            return np.concatenate(xs, axis=0)
-        else:
-            return np.concatenate(xs, axis=1)
-
-    supported_format = (
-        MatrixFormat.NBM,
-        MatrixFormat.BNM,
-        MatrixFormat.NMB,
-        MatrixFormat.NM,
-    )
-    if src_format not in supported_format:
-        raise ValueError(
-            f"Supported Jacobian format is {supported_format}, but got {src_format}"
-        )
-    if not isinstance(src, typing.Sequence):
-        return src
-    if not isinstance(src[0], typing.Sequence):
-        src = [src]
-
-    return concat_row(tuple(concat_col(xs) for xs in src))
-
-
-##########################################################
-# Utils for generating test data.
-##########################################################
-def gen_static_data_and_feed(xs, v, stop_gradient=True):
-    feed = {}
-    if isinstance(xs, typing.Sequence):
-        static_xs = []
-        for i, x in enumerate(xs):
-            x = paddle.static.data(f"x{i}", x.shape, x.dtype)
-            x.stop_gradient = stop_gradient
-            static_xs.append(x)
-        feed.update({f'x{idx}': value for idx, value in enumerate(xs)})
-    else:
-        static_xs = paddle.static.data('x', xs.shape, xs.dtype)
-        static_xs.stop_gradient = stop_gradient
-        feed.update({'x': xs})
-
-    if isinstance(v, typing.Sequence):
-        static_v = []
-        for i, e in enumerate(v):
-            e = paddle.static.data(f'v{i}', e.shape, e.dtype)
-            e.stop_gradient = stop_gradient
-            static_v.append(e)
-        feed.update({f'v{i}': value for i, value in enumerate(v)})
-    elif v is not None:
-        static_v = paddle.static.data('v', v.shape, v.dtype)
-        static_v.stop_gradient = stop_gradient
-        feed.update({'v': v})
-    else:
-        static_v = v
-
-    return feed, static_xs, static_v
-
-
-def gen_static_inputs_and_feed(xs, stop_gradient=True):
-    feed = {}
-    if isinstance(xs, typing.Sequence):
-        static_xs = []
-        for i, x in enumerate(xs):
-            x = paddle.static.data(f"x{i}", x.shape, x.dtype)
-            x.stop_gradient = stop_gradient
-            static_xs.append(x)
-        feed.update({f'x{idx}': value for idx, value in enumerate(xs)})
-    else:
-        static_xs = paddle.static.data('x', xs.shape, xs.dtype)
-        static_xs.stop_gradient = stop_gradient
-        feed.update({'x': xs})
-    return feed, static_xs
diff --git a/test/deprecated/collective/fleet/CMakeLists.txt b/test/deprecated/collective/fleet/CMakeLists.txt
index c3d8d0e48e9dc..304f185847917 100644
--- a/test/deprecated/collective/fleet/CMakeLists.txt
+++ b/test/deprecated/collective/fleet/CMakeLists.txt
@@ -13,8 +13,8 @@ endif()
 
 if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
   py_test_modules(
-    test_fleet_fp16_allreduce_meta_optimizer MODULES
-    test_fleet_fp16_allreduce_meta_optimizer ENVS
+    test_fleet_fp16_allreduce_meta_optimizer_deprecated MODULES
+    test_fleet_fp16_allreduce_meta_optimizer_deprecated ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
 endif()
 
@@ -38,6 +38,7 @@ endif()
 
 if(LOCAL_ALL_ARCH AND (LINUX OR WIN32))
   py_test_modules(
-    test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS
+    test_fleet_meta_optimizer_base_deprecated MODULES
+    test_fleet_meta_optimizer_base_deprecated ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
 endif()
diff --git a/test/deprecated/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py b/test/deprecated/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer_deprecated.py
similarity index 100%
rename from test/deprecated/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py
rename to test/deprecated/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer_deprecated.py
diff --git a/test/deprecated/collective/fleet/test_fleet_meta_optimizer_base.py b/test/deprecated/collective/fleet/test_fleet_meta_optimizer_base_deprecated.py
similarity index 99%
rename from test/deprecated/collective/fleet/test_fleet_meta_optimizer_base.py
rename to test/deprecated/collective/fleet/test_fleet_meta_optimizer_base_deprecated.py
index 8878fdc172e2f..2da076437aede 100755
--- a/test/deprecated/collective/fleet/test_fleet_meta_optimizer_base.py
+++ b/test/deprecated/collective/fleet/test_fleet_meta_optimizer_base_deprecated.py
@@ -22,6 +22,8 @@
     MetaOptimizerBase,
 )
 
+paddle.enable_static()
+
 
 class TestFleetMetaOptimizerBase(unittest.TestCase):
     def net(main_prog, startup_prog):
diff --git a/test/deprecated/contrib/CMakeLists.txt b/test/deprecated/contrib/CMakeLists.txt
index a8ed413e6ce9e..fb82eaa2b6817 100644
--- a/test/deprecated/contrib/CMakeLists.txt
+++ b/test/deprecated/contrib/CMakeLists.txt
@@ -8,4 +8,5 @@ foreach(src ${TEST_OPS})
   py_test(${src} SRCS ${src}.py)
 endforeach()
 
-set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
+set_tests_properties(test_image_classification_fp16_deprecated
+                     PROPERTIES TIMEOUT 120)
diff --git a/test/deprecated/contrib/test_bf16_utils.py b/test/deprecated/contrib/test_bf16_utils_deprecated.py
similarity index 57%
rename from test/deprecated/contrib/test_bf16_utils.py
rename to test/deprecated/contrib/test_bf16_utils_deprecated.py
index 8bc3cf43b8748..54f3ff73e0099 100644
--- a/test/deprecated/contrib/test_bf16_utils.py
+++ b/test/deprecated/contrib/test_bf16_utils_deprecated.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import unittest
 
 import paddle
@@ -22,92 +21,7 @@
 paddle.enable_static()
 
 
-class AMPTest(unittest.TestCase):
-    def setUp(self):
-        self.bf16_list = copy.copy(amp.bf16.amp_lists.bf16_list)
-        self.fp32_list = copy.copy(amp.bf16.amp_lists.fp32_list)
-        self.gray_list = copy.copy(amp.bf16.amp_lists.gray_list)
-        self.amp_lists_ = None
-
-    def tearDown(self):
-        self.assertEqual(self.amp_lists_.bf16_list, self.bf16_list)
-        self.assertEqual(self.amp_lists_.fp32_list, self.fp32_list)
-        self.assertEqual(self.amp_lists_.gray_list, self.gray_list)
-
-    def test_amp_lists(self):
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16()
-
-    def test_amp_lists_1(self):
-        # 1. w={'exp}, b=None
-        self.bf16_list.add('exp')
-        self.fp32_list.remove('exp')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'exp'})
-
-    def test_amp_lists_2(self):
-        # 2. w={'tanh'}, b=None
-        self.fp32_list.remove('tan')
-        self.bf16_list.add('tan')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'tan'})
-
-    def test_amp_lists_3(self):
-        # 3. w={'lstm'}, b=None
-        self.bf16_list.add('lstm')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16({'lstm'})
-
-    def test_amp_lists_4(self):
-        # 4. w=None, b={'matmul_v2'}
-        self.bf16_list.remove('matmul_v2')
-        self.fp32_list.add('matmul_v2')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
-            custom_fp32_list={'matmul_v2'}
-        )
-
-    def test_amp_lists_5(self):
-        # 5. w=None, b={'matmul_v2'}
-        self.fp32_list.add('matmul_v2')
-        self.bf16_list.remove('matmul_v2')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
-            custom_fp32_list={'matmul_v2'}
-        )
-
-    def test_amp_lists_6(self):
-        # 6. w=None, b={'lstm'}
-        self.fp32_list.add('lstm')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
-            custom_fp32_list={'lstm'}
-        )
-
-    def test_amp_lists_7(self):
-        self.fp32_list.add('reshape2')
-        self.gray_list.remove('reshape2')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
-            custom_fp32_list={'reshape2'}
-        )
-
-    def test_amp_list_8(self):
-        self.bf16_list.add('reshape2')
-        self.gray_list.remove('reshape2')
-
-        self.amp_lists_ = amp.bf16.AutoMixedPrecisionListsBF16(
-            custom_bf16_list={'reshape2'}
-        )
-
-
 class AMPTest2(unittest.TestCase):
-    def test_amp_lists_(self):
-        # 7. w={'lstm'} b={'lstm'}
-        # raise ValueError
-        self.assertRaises(
-            ValueError, amp.bf16.AutoMixedPrecisionListsBF16, {'lstm'}, {'lstm'}
-        )
-
     def test_find_op_index(self):
         block = base.default_main_program().global_block()
         op_desc = core.OpDesc()
diff --git a/test/deprecated/contrib/test_image_classification_fp16_deprecated.py b/test/deprecated/contrib/test_image_classification_fp16_deprecated.py
new file mode 100644
index 0000000000000..6c60ad0d8e415
--- /dev/null
+++ b/test/deprecated/contrib/test_image_classification_fp16_deprecated.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+# TODO: remove sys.path.append
+sys.path.append("../../legacy_test")
+import nets
+
+import paddle
+from paddle.framework import in_pir_mode
+from paddle.static.amp import decorate
+
+paddle.enable_static()
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max',
+        )
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = paddle.nn.functional.dropout(x=conv5, p=0.5)
+    fc1 = paddle.static.nn.fc(x=drop, size=4096, activation=None)
+    if in_pir_mode():
+        batch_norm = paddle.nn.BatchNorm(4096)
+        bn = batch_norm(fc1)
+    else:
+        bn = paddle.static.nn.batch_norm(input=fc1, act='relu')
+    drop2 = paddle.nn.functional.dropout(x=bn, p=0.5)
+    fc2 = paddle.static.nn.fc(x=drop2, size=4096, activation=None)
+    return fc2
+
+
+class TestAmpWithNonIterableDataLoader(unittest.TestCase):
+    def decorate_with_data_loader(self):
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog):
+            with paddle.base.unique_name.guard():
+                image = paddle.static.data(
+                    name='image', shape=[-1, 3, 224, 224], dtype='float32'
+                )
+                label = paddle.static.data(
+                    name='label', shape=[-1, 1], dtype='int64'
+                )
+
+                net = vgg16_bn_drop(image)
+                logits = paddle.static.nn.fc(
+                    x=net, size=10, activation="softmax"
+                )
+                cost, predict = paddle.nn.functional.softmax_with_cross_entropy(
+                    logits, label, return_softmax=True
+                )
+                avg_cost = paddle.mean(cost)
+
+                optimizer = paddle.optimizer.Lamb(learning_rate=0.001)
+                amp_lists = paddle.static.amp.AutoMixedPrecisionLists(
+                    custom_black_varnames={"loss", "conv2d_0.w_0"}
+                )
+                mp_optimizer = decorate(
+                    optimizer=optimizer,
+                    amp_lists=amp_lists,
+                    init_loss_scaling=8.0,
+                    use_dynamic_loss_scaling=True,
+                )
+
+                mp_optimizer.minimize(avg_cost)
+
+    def test_non_iterable_dataloader(self):
+        self.decorate_with_data_loader()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/cpp/prim/CMakeLists.txt b/test/deprecated/cpp/prim/CMakeLists.txt
index 8f7270397a382..4b62d1ef9b8cb 100644
--- a/test/deprecated/cpp/prim/CMakeLists.txt
+++ b/test/deprecated/cpp/prim/CMakeLists.txt
@@ -1,4 +1,4 @@
-paddle_test(test_comp_static SRCS test_static_prim.cc)
+paddle_test(test_comp_static SRCS test_static_prim_deprecated.cc)
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will
diff --git a/test/deprecated/cpp/prim/test_static_prim.cc b/test/deprecated/cpp/prim/test_static_prim_deprecated.cc
similarity index 99%
rename from test/deprecated/cpp/prim/test_static_prim.cc
rename to test/deprecated/cpp/prim/test_static_prim_deprecated.cc
index dfda6cecbb411..a7bb6cbea7720 100644
--- a/test/deprecated/cpp/prim/test_static_prim.cc
+++ b/test/deprecated/cpp/prim/test_static_prim_deprecated.cc
@@ -31,8 +31,7 @@
 PD_DECLARE_bool(prim_enabled);
 COMMON_DECLARE_string(tensor_operants_mode);
 
-namespace paddle {
-namespace prim {
+namespace paddle::prim {
 
 using Tensor = paddle::Tensor;
 struct TestBaseProgram {
@@ -527,5 +526,4 @@ TEST(StaticPrim, TestFlags) {
   ASSERT_FALSE(PrimCommonUtils::IsBwdPrimEnabled());
 }
 
-}  // namespace prim
-}  // namespace paddle
+}  // namespace paddle::prim
diff --git a/test/deprecated/cpp_extension/CMakeLists.txt b/test/deprecated/cpp_extension/CMakeLists.txt
deleted file mode 100644
index 9f4efa9893574..0000000000000
--- a/test/deprecated/cpp_extension/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-py_test(test_mixed_extension_setup SRCS test_mixed_extension_setup.py)
-set_tests_properties(test_mixed_extension_setup PROPERTIES TIMEOUT 120)
diff --git a/test/deprecated/custom_op/CMakeLists.txt b/test/deprecated/custom_op/CMakeLists.txt
index 346de7ea3c708..dd4fb6d8713ec 100644
--- a/test/deprecated/custom_op/CMakeLists.txt
+++ b/test/deprecated/custom_op/CMakeLists.txt
@@ -1,7 +1,6 @@
 if(WITH_TESTING)
-  py_test(test_custom_raw_op_kernel_op SRCS test_custom_raw_op_kernel_op.py)
-  set_tests_properties(test_custom_raw_op_kernel_op PROPERTIES TIMEOUT 180)
-
-  py_test(test_custom_cast_op_jit SRCS test_custom_cast_op_jit.py)
-  set_tests_properties(test_custom_cast_op_jit PROPERTIES TIMEOUT 180)
+  py_test(test_custom_raw_op_kernel_op_deprecated
+          SRCS test_custom_raw_op_kernel_op_deprecated.py)
+  set_tests_properties(test_custom_raw_op_kernel_op_deprecated
+                       PROPERTIES TIMEOUT 180)
 endif()
diff --git a/test/deprecated/custom_op/test_custom_raw_op_kernel_op.py b/test/deprecated/custom_op/test_custom_raw_op_kernel_op_deprecated.py
similarity index 100%
rename from test/deprecated/custom_op/test_custom_raw_op_kernel_op.py
rename to test/deprecated/custom_op/test_custom_raw_op_kernel_op_deprecated.py
diff --git a/test/deprecated/distributed_passes/CMakeLists.txt b/test/deprecated/distributed_passes/CMakeLists.txt
deleted file mode 100644
index d9ee247cae2ba..0000000000000
--- a/test/deprecated/distributed_passes/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-file(
-  GLOB TEST_OPS
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-if((NOT WITH_GPU) AND (NOT WITH_XPU))
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_adam_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_all_reduce_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_act_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_add_act_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_momentum_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass")
-  list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass")
-  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass")
-  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass")
-  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass")
-  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_fp16_pass")
-  list(REMOVE_ITEM TEST_OPS "test_auto_parallel_gradient_merge_pass")
-  list(REMOVE_ITEM TEST_OPS
-       "test_auto_parallel_data_parallel_optimization_pass")
-endif()
-
-if(NOT ((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6)))
-  list(REMOVE_ITEM TEST_OPS test_dist_fuse_gemm_epilogue_pass)
-  list(REMOVE_ITEM TEST_OPS test_auto_parallel_fused_linear_promotion_pass)
-endif()
-
-foreach(TEST_OP ${TEST_OPS})
-  py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0")
-  list(APPEND DIST_TEST_OPS ${TEST_OP})
-  set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 250)
-  set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
-endforeach()
diff --git a/test/deprecated/fft/CMakeLists.txt b/test/deprecated/fft/CMakeLists.txt
deleted file mode 100644
index a31ec8e1f2137..0000000000000
--- a/test/deprecated/fft/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-file(
-  GLOB TEST_OPS
-  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-  "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-foreach(TEST_OP ${TEST_OPS})
-  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-endforeach()
-
-set_pir_tests_properties()
diff --git a/test/deprecated/ir/pir/CMakeLists.txt b/test/deprecated/ir/pir/CMakeLists.txt
index bcb550df74c03..df4ff900910b3 100644
--- a/test/deprecated/ir/pir/CMakeLists.txt
+++ b/test/deprecated/ir/pir/CMakeLists.txt
@@ -8,5 +8,3 @@ foreach(target ${TEST_INTERP_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
                   FLAGS_enable_pir_in_executor=true)
 endforeach()
-
-add_subdirectory(translator)
diff --git a/test/deprecated/ir/pir/test_pass_manager.py b/test/deprecated/ir/pir/test_pass_manager.py
deleted file mode 100644
index 852b3595ba492..0000000000000
--- a/test/deprecated/ir/pir/test_pass_manager.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle
-from paddle import pir
-from paddle.base import core
-from paddle.framework import LayerHelper
-
-paddle.enable_static()
-
-
-class TestShadowOutputSlice(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.ones([3, 9, 5], dtype='float32')
-                y = paddle.static.data(
-                    name="y", shape=[3, 9, 5], dtype="float32"
-                )
-                z = x * y  # will be eliminated
-
-                _, out, _ = paddle.split(x, num_or_sections=3, axis=1)
-                helper = LayerHelper('shadow_output')
-                helper.append_op(
-                    type="shadow_output",
-                    inputs={"x": [out.name]},
-                    outputs={"out": [y.name]},
-                    attrs={"name": out.name},
-                )
-
-        new_program = pir.translate_to_pir(main_program.desc)
-        op_names = [op.name() for op in new_program.global_block().ops]
-        self.assertTrue('pd_op.multiply' in op_names)
-        pm = pir.PassManager()
-        pm.add_pass(
-            'dead_code_elimination_pass', {}
-        )  # apply pass to eliminate dead code
-        pm.run(new_program)
-        op_names = [op.name() for op in new_program.global_block().ops]
-        self.assertEqual(pm.passes(), ['dead_code_elimination_pass'])
-        self.assertFalse(pm.empty())
-        self.assertTrue(
-            'pd_op.multiply' not in op_names
-        )  # multiply is eliminated because its output is not used
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/deprecated/ir/pir/test_special_op_translator.py b/test/deprecated/ir/pir/test_special_op_translator.py
deleted file mode 100644
index 687f0248535ed..0000000000000
--- a/test/deprecated/ir/pir/test_special_op_translator.py
+++ /dev/null
@@ -1,555 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-
-import paddle
-from paddle import pir
-from paddle.base import core
-from paddle.framework import LayerHelper
-
-paddle.enable_static()
-
-
-class TestCastOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.to_tensor([2, 3, 4], 'float64')
-                y = paddle.cast(x, 'uint8')
-
-        _, mappings = pir.translate_to_pir_with_param_map(main_program.desc)
-        assert len(str(mappings)) > 0, "no mapping found"
-
-
-class TestCondWithInplace(unittest.TestCase):
-    def test_op(self):
-        def cond_with_inplace():
-            x = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
-            y = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
-            running_mean = paddle.to_tensor([0], dtype="float32")
-            running_variance = paddle.to_tensor([1], dtype="float32")
-            weight = paddle.to_tensor([2], dtype="float32")
-            bias = paddle.to_tensor([1], dtype="float32")
-            if x > y:
-                y = paddle.nn.functional.batch_norm(
-                    x, running_mean, running_variance, weight, bias
-                )
-            else:
-                y = paddle.nn.functional.batch_norm(
-                    x, running_mean, running_variance, weight, bias
-                )
-
-        legacy_program = paddle.jit.to_static(
-            cond_with_inplace,
-            input_spec=[],
-            full_graph=True,
-        )
-
-        l = pir.translate_to_pir(legacy_program.main_program.desc)
-        assert l is not None
-
-    def test_nested_op(self):
-        def cond_with_inplace():
-            x = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
-            y = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
-            z = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
-            running_mean = paddle.to_tensor([0], dtype="float32")
-            running_variance = paddle.to_tensor([1], dtype="float32")
-            weight = paddle.to_tensor([2], dtype="float32")
-            bias = paddle.to_tensor([1], dtype="float32")
-            if x > y:
-                if y > z:
-                    z = paddle.nn.functional.batch_norm(
-                        z, running_mean, running_variance, weight, bias
-                    )
-                else:
-                    y = paddle.nn.functional.batch_norm(
-                        x, running_mean, running_variance, weight, bias
-                    )
-            else:
-                if y > z:
-                    z = paddle.nn.functional.batch_norm(
-                        z, running_mean, running_variance, weight, bias
-                    )
-                else:
-                    y = paddle.nn.functional.batch_norm(
-                        x, running_mean, running_variance, weight, bias
-                    )
-
-        legacy_program = paddle.jit.to_static(
-            cond_with_inplace,
-            input_spec=[],
-            full_graph=True,
-        )
-
-        l = pir.translate_to_pir(legacy_program.main_program.desc)
-        assert l is not None
-
-
-class TestElementwiseOpTranscriber(unittest.TestCase):
-    def test_elementwise_without_y_grad(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x_data = np.random.rand(100, 2, 3)
-                y_data = np.random.rand(100)
-                x = paddle.to_tensor(x_data, dtype='float32')
-                x.stop_gradient = False
-                y = paddle.to_tensor(y_data, dtype='float32')
-
-                out1 = paddle.tensor.math._elementwise_op(
-                    LayerHelper('elementwise_add', x=x, y=y, axis=0)
-                )
-                out1.stop_gradient = False
-                mean = paddle.mean(out1)
-                paddle.static.append_backward(mean)
-
-                out = exe.run(main_program, {}, fetch_list=[out1])
-                np.testing.assert_allclose(
-                    out[0],
-                    x_data + y_data.reshape(100, 1, 1),
-                    rtol=1e-6,
-                    atol=1e-6,
-                )
-
-    def test_elementwise_with_y_grad(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x_data = np.random.rand(100, 2, 3)
-                y_data = np.random.rand(100)
-                x = paddle.to_tensor(x_data, dtype='float32')
-                x.stop_gradient = False
-                y = paddle.to_tensor(y_data, dtype='float32')
-                y.stop_gradient = False
-
-                out1 = paddle.tensor.math._elementwise_op(
-                    LayerHelper('elementwise_add', x=x, y=y, axis=0)
-                )
-                out1.stop_gradient = False
-                mean = paddle.mean(out1)
-                paddle.static.append_backward(mean)
-
-                out = exe.run(main_program, {}, fetch_list=[out1])
-                np.testing.assert_allclose(
-                    out[0],
-                    x_data + y_data.reshape(100, 1, 1),
-                    rtol=1e-6,
-                    atol=1e-6,
-                )
-
-    def test_add_inplace(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.ones(shape=(100, 2, 3), dtype='float32')
-                y = paddle.ones(shape=(100, 2, 3), dtype='float32')
-
-                helper = LayerHelper('elementwise_add')
-                helper.append_op(
-                    type="elementwise_add",
-                    inputs={"X": x, "Y": y},
-                    outputs={"Out": y},
-                    attrs={"axis": -1},
-                )
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestEmbeddingOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.static.data(name="x", shape=[2, 4], dtype=np.int64)
-                embedding = paddle.nn.Embedding(
-                    10, 3, weight_attr=paddle.nn.initializer.Constant(value=1.0)
-                )
-                output = embedding(x)
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestIncrementOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                data = paddle.zeros(shape=[1], dtype='float32')
-                counter = paddle.increment(data)
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestAssignValueOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.to_tensor(
-                    [[0.1, 0.2], [0.3, 0.4]],
-                    place=paddle.CPUPlace(),
-                    stop_gradient=False,
-                )
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestRnnOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.randn((4, 16))
-                prev_h = paddle.randn((4, 32))
-
-                cell = paddle.nn.SimpleRNNCell(16, 32)
-                y, h = cell(x, prev_h)
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestEmptyVarTranslate(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x1 = paddle.rand(shape=[3, 3], dtype="float32")
-                x1.stop_gradient = False
-                weight = paddle.full(
-                    shape=[3, 3], fill_value="0.5", dtype="float32"
-                )
-                y = paddle.nn.functional.linear(x1, weight)
-                y.stop_gradient = True
-                out1 = paddle.concat(x=[x1, y], axis=1)
-                out2 = paddle.mean(out1)
-                sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1)
-                sgd_optimizer.minimize(out2)
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestOneHotOpTranscriber(unittest.TestCase):
-    def test_mutable_attribute(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                depth = paddle.assign(np.array([10], dtype=np.int32))
-                label = paddle.static.data(
-                    name="label", shape=[-1, 1], dtype="int64"
-                )
-                one_hot_label = paddle.nn.functional.one_hot(
-                    x=label, num_classes=depth
-                )
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-    def test_normal_attribute(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                depth = 10
-                label = paddle.static.data(
-                    name="label", shape=[-1, 1], dtype="int64"
-                )
-                one_hot_label = paddle.nn.functional.one_hot(
-                    x=label, num_classes=depth
-                )
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestReduceOpTranscriber(unittest.TestCase):
-    def test_reduce_all(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                arr = np.ones([2, 2], dtype="float32")
-                x = paddle.to_tensor(arr, dtype='int32')
-                out1 = paddle.all(x)
-
-                out = exe.run(main_program, {}, fetch_list=[out1])
-                np.testing.assert_array_equal(out[0], np.all(arr))
-
-    def test_with_axis(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                arr = np.ones([2, 2], dtype="float32")
-                x = paddle.to_tensor(arr, dtype='int32')
-                out1 = paddle.all(x, axis=0)
-
-                out = exe.run(main_program, {}, fetch_list=[out1])
-                np.testing.assert_array_equal(out[0], np.all(arr, axis=0))
-
-
-class TestIndexPutOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.randn([2, 3])
-                indices = [paddle.randint(0, 2, [2]), paddle.randint(0, 1, [2])]
-                value = paddle.randn([2])
-                y = paddle.index_put(x, indices, value, False)
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestGradAddOpTranscriber(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x_data = np.random.rand(100, 2, 3)
-                y_data = np.random.rand(100, 1, 1)
-                x = paddle.to_tensor(x_data, dtype='float32')
-                x.stop_gradient = False
-                y = paddle.to_tensor(y_data, dtype='float32')
-
-                helper = LayerHelper('grad_add')
-                out = helper.create_variable_for_type_inference("float")
-                helper.append_op(
-                    type="grad_add",
-                    inputs={"X": x, "Y": y},
-                    outputs={"Out": out},
-                    attrs={"axis": -1},
-                )
-
-        _ = pir.translate_to_pir(main_program.desc)
-
-
-class TestShadowOutputSlice(unittest.TestCase):
-    def test_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.rand([3, 9, 5])
-                y = paddle.static.data(
-                    name="y", shape=[3, 9, 5], dtype="float32"
-                )
-
-                _, out, _ = paddle.split(x, num_or_sections=3, axis=1)
-                helper = LayerHelper('shadow_output')
-                helper.append_op(
-                    type="shadow_output",
-                    inputs={"x": [out.name]},
-                    outputs={"out": [y.name]},
-                    attrs={"name": out.name},
-                )
-
-        l = pir.translate_to_pir(main_program.desc)
-
-
-class TestSetValueOp(unittest.TestCase):
-    def test_no_mutable_attribute(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.ones(shape=[2, 3, 4], dtype="float32")
-                x = paddle.static.setitem(x, (0, 0), 6)
-        ret = exe.run(main_program, fetch_list=[x])
-
-        x_data = np.ones([2, 3, 4]).astype("float32")
-        x_data[0, 0] = 6
-        np.testing.assert_array_equal(ret[0], x_data)
-
-    def test_with_mutable_attribute(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.ones(shape=[2, 3, 4], dtype="float32")
-                zero = paddle.full([], 0, dtype="int32")
-                x = paddle.static.setitem(x, zero, 6)
-        ret = exe.run(main_program, fetch_list=[x])
-
-        x_data = np.ones([2, 3, 4]).astype("float32")
-        x_data[0] = 6
-        np.testing.assert_array_equal(ret[0], x_data)
-
-    def test_grad(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-        exe = paddle.static.Executor(place)
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        input_shape = [7, 6, 5, 4, 3, 2]
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.ones(shape=input_shape, dtype="float32")
-                value = paddle.tensor.fill_constant([1, 3, 2], "float32", 1)
-                # test stop_gradient
-                value.stop_gradient = False
-                x.stop_gradient = False
-                attrs = {
-                    'axes': [0],
-                    'starts': [6],
-                    'ends': [0],
-                    'steps': [-4],
-                    'decrease_axes': [],
-                    'none_axes': [],
-                    'dtype': paddle.float32,
-                }
-                inputs = {'Input': x, 'ValueTensor': value}
-
-                helper = LayerHelper("set_value")
-                y = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-                helper.append_op(
-                    type="set_value",
-                    inputs=inputs,
-                    outputs={'Out': y},
-                    attrs=attrs,
-                )
-                y2 = y + 1
-                loss = paddle.sum(y2)
-                opt = paddle.optimizer.Adam()
-                opt.minimize(loss)
-
-                x_data = np.arange(
-                    0, np.prod(input_shape), dtype="float32"
-                ).reshape(input_shape)
-                fetch_list = [x.grad_name, value.grad_name]
-                ret = exe.run(main_program, fetch_list=fetch_list)
-                self.assertTrue((ret[0][6:0:-4] == 0).all())
-
-
-class TestShareBufferOpTranscriber(unittest.TestCase):
-    def test_program(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                x = paddle.ones(shape=(100, 2, 3), dtype='float32')
-                y = paddle.ones(shape=(100, 2, 3), dtype='float32')
-
-                helper = LayerHelper('share_buffer')
-                helper.append_op(
-                    type="share_buffer",
-                    inputs={"X": x},
-                    outputs={"Out": y, "XOut": x},
-                )
-        l = pir.translate_to_pir(main_program.desc)
-        assert (
-            l.global_block().ops[2].name() == "pd_op.share_data_"
-        ), "share_buffer should be translated to share_data_"
-
-
-class TestDataOp(unittest.TestCase):
-    def test_data_op(self):
-        place = core.Place()
-        place.set_place(paddle.CPUPlace())
-
-        new_scope = paddle.static.Scope()
-        main_program = paddle.static.Program()
-        with paddle.static.scope_guard(new_scope):
-            with paddle.static.program_guard(main_program):
-                _ = paddle.static.data(name="y", shape=[3, 9, 5], dtype="int64")
-        l = pir.translate_to_pir(main_program.desc)
-        self.assertTrue(len(l.global_block().ops) > 0)
-        self.assertTrue(l.global_block().ops[0].name() == "pd_op.data")
-        data_op = l.global_block().ops[0]
-        self.assertIn("dtype", data_op.attrs())
-        self.assertEqual(str(data_op.attrs()["dtype"]), "paddle.int64")
-
-
-class TestCheckUnregisteredOp(unittest.TestCase):
-    def test_program(self):
-        main_program = paddle.static.Program()
-        with paddle.static.program_guard(main_program):
-            x = paddle.randn((4, 16))
-            prev_h = paddle.randn((4, 32))
-
-            cell = paddle.nn.SimpleRNNCell(16, 32)
-            y, h = cell(x, prev_h)
-
-        ops = pir.check_unregistered_ops(main_program.desc)
-        assert len(ops) == 0
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/deprecated/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py b/test/deprecated/ir/test_ir_embedding_eltwise_layernorm_fuse_pass_deprecated.py
similarity index 99%
rename from test/deprecated/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py
rename to test/deprecated/ir/test_ir_embedding_eltwise_layernorm_fuse_pass_deprecated.py
index 6b2af9ace72bf..68c109120511e 100644
--- a/test/deprecated/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py
+++ b/test/deprecated/ir/test_ir_embedding_eltwise_layernorm_fuse_pass_deprecated.py
@@ -24,6 +24,8 @@
 from paddle import base
 from paddle.base import core
 
+paddle.enable_static()
+
 
 class EmbEltwiseLayerNormFusePassTest(PassTest):
     def setUp(self):
diff --git a/test/deprecated/ir/test_ir_fc_fuse_pass.py b/test/deprecated/ir/test_ir_fc_fuse_pass_deprecated.py
similarity index 98%
rename from test/deprecated/ir/test_ir_fc_fuse_pass.py
rename to test/deprecated/ir/test_ir_fc_fuse_pass_deprecated.py
index e7dde8b9f6c4c..79aa2f6efc9eb 100644
--- a/test/deprecated/ir/test_ir_fc_fuse_pass.py
+++ b/test/deprecated/ir/test_ir_fc_fuse_pass_deprecated.py
@@ -24,6 +24,8 @@
 from paddle import base
 from paddle.base import core
 
+paddle.enable_static()
+
 
 class FCFusePassTest(PassTest):
     def setUp(self):
diff --git a/test/deprecated/ir/test_ir_generate_pass.py b/test/deprecated/ir/test_ir_generate_pass_deprecated.py
similarity index 100%
rename from test/deprecated/ir/test_ir_generate_pass.py
rename to test/deprecated/ir/test_ir_generate_pass_deprecated.py
diff --git a/test/deprecated/ir/test_ir_graph_to_program_pass.py b/test/deprecated/ir/test_ir_graph_to_program_pass_deprecated.py
similarity index 100%
rename from test/deprecated/ir/test_ir_graph_to_program_pass.py
rename to test/deprecated/ir/test_ir_graph_to_program_pass_deprecated.py
diff --git a/test/deprecated/ir/test_ir_preln_residual_bias_fuse_pass.py b/test/deprecated/ir/test_ir_preln_residual_bias_fuse_pass_deprecated.py
similarity index 100%
rename from test/deprecated/ir/test_ir_preln_residual_bias_fuse_pass.py
rename to test/deprecated/ir/test_ir_preln_residual_bias_fuse_pass_deprecated.py
diff --git a/test/deprecated/ir/test_ir_skip_layernorm_pass.py b/test/deprecated/ir/test_ir_skip_layernorm_pass_deprecated.py
similarity index 100%
rename from test/deprecated/ir/test_ir_skip_layernorm_pass.py
rename to test/deprecated/ir/test_ir_skip_layernorm_pass_deprecated.py
diff --git a/test/deprecated/ir/test_ir_yolo_box_pass.py b/test/deprecated/ir/test_ir_yolo_box_pass_deprecated.py
similarity index 100%
rename from test/deprecated/ir/test_ir_yolo_box_pass.py
rename to test/deprecated/ir/test_ir_yolo_box_pass_deprecated.py
diff --git a/test/deprecated/ir/test_op_input_grad_semantic.py b/test/deprecated/ir/test_op_input_grad_semantic_deprecated.py
similarity index 100%
rename from test/deprecated/ir/test_op_input_grad_semantic.py
rename to test/deprecated/ir/test_op_input_grad_semantic_deprecated.py
diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt
index 18891bc1cb65e..61046057f7c7a 100644
--- a/test/deprecated/legacy_test/CMakeLists.txt
+++ b/test/deprecated/legacy_test/CMakeLists.txt
@@ -33,17 +33,18 @@ set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_ps_gpu)
-list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
+list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo_deprecated)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend)
 list(APPEND MIXED_DIST_TEST_OPS test_ascend_group)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
 
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_base)
-list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2)
-list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_deprecated)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_2_deprecated)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3_deprecated)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto)
-list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner)
-list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt)
+list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_deprecated)
+list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt_deprecated)
 list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_searcher)
 list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard)
 list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_dist_tensor)
@@ -100,11 +101,11 @@ if(WIN32)
   list(REMOVE_ITEM TEST_OPS test_trainer_desc)
   list(REMOVE_ITEM TEST_OPS test_checkpoint_notify_op)
   list(REMOVE_ITEM TEST_OPS test_downpoursgd_deprecated)
-  list(REMOVE_ITEM TEST_OPS test_fleet)
-  list(REMOVE_ITEM TEST_OPS test_fleet_nocvm_1)
+  list(REMOVE_ITEM TEST_OPS test_fleet_deprecated)
+  list(REMOVE_ITEM TEST_OPS test_fleet_nocvm_1_deprecated)
   list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker)
   list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_3)
-  list(REMOVE_ITEM TEST_OPS test_fleet_unitaccessor)
+  list(REMOVE_ITEM TEST_OPS test_fleet_unitaccessor_deprecated)
   list(REMOVE_ITEM TEST_OPS test_ps_dispatcher)
   list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_nlp)
   list(REMOVE_ITEM TEST_OPS test_nvprof)
@@ -115,7 +116,7 @@ endif()
 
 if(NOT WITH_DISTRIBUTE OR WIN32)
   # DISTRIBUTE related
-  list(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
+  list(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization_deprecated)
   list(REMOVE_ITEM TEST_OPS test_fleet_metric)
   list(REMOVE_ITEM TEST_OPS test_fleet_ps)
   list(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_2)
@@ -130,7 +131,6 @@ endif()
 
 if(WIN32)
   list(REMOVE_ITEM TEST_OPS test_complex_matmul)
-  list(REMOVE_ITEM TEST_OPS test_ops_nms)
   list(REMOVE_ITEM TEST_OPS test_trt_convert_preln_residual_bias)
   list(REMOVE_ITEM TEST_OPS test_masked_multihead_attention_op)
   list(REMOVE_ITEM TEST_OPS test_fused_ec_moe_op)
@@ -162,8 +162,8 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM))
   # TODO(shenliang03): batch_fc_op support CPU device in future
   # TODO(Yancey1989): parallel dygraph support CPU device in future
   list(REMOVE_ITEM TEST_OPS test_fleet_base_single)
-  list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner)
-  list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_deprecated)
+  list(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt_deprecated)
   list(REMOVE_ITEM TEST_OPS test_auto_parallel_searcher)
   list(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard)
   list(REMOVE_ITEM TEST_OPS test_auto_parallel_dist_tensor)
@@ -200,8 +200,8 @@ list(REMOVE_ITEM TEST_OPS decorator_helper)
 
 if(APPLE)
   if(NOT WITH_DISTRIBUTE)
-    list(REMOVE_ITEM TEST_OPS test_desc_clone)
-    list(REMOVE_ITEM TEST_OPS test_program_code)
+    list(REMOVE_ITEM TEST_OPS test_desc_clone_deprecated)
+    list(REMOVE_ITEM TEST_OPS test_program_code_deprecated)
   endif()
   message(
     WARNING
@@ -398,11 +398,9 @@ function(parallel_bash_test_modules TARGET_NAME)
   endif()
 endfunction()
 
-list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type)
+list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type_deprecated)
 list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array)
-list(REMOVE_ITEM TEST_OPS test_data_norm_op)
-list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
-list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
+list(REMOVE_ITEM TEST_OPS test_data_norm_op_deprecated)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
 list(REMOVE_ITEM TEST_OPS test_layers_deprecated)
@@ -413,7 +411,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
-list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
+list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass_deprecated)
 
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
@@ -428,6 +426,7 @@ endif()
 
 if(APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
+  list(REMOVE_ITEM TEST_OPS test_dataset_deprecated)
   list(REMOVE_ITEM TEST_OPS test_dataset_dataloader)
   list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_process)
   list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exit_func)
@@ -452,8 +451,8 @@ endif()
 
 # Some ops need to check results when gc is enabled
 # Currently, only ops that register NoNeedBufferVarsInference need to do this test
-set(TEST_OPS_WITH_GC test_affine_channel_op test_gather_nd_op test_scatter_op
-                     test_slice_op)
+set(TEST_OPS_WITH_GC test_gather_nd_op test_slice_op
+                     test_slice_op_deprecated)
 
 foreach(TEST_OP ${TEST_OPS_WITH_GC})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
@@ -485,10 +484,6 @@ set_tests_properties(test_logcumsumexp_op PROPERTIES TIMEOUT 30)
 py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS
                 FLAGS_inner_op_parallelism=4)
 
-py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS
-                ${GC_ENVS})
-py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS
-                ${GC_ENVS})
 py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
                 FLAGS_cudnn_deterministic=1)
 py_test_modules(
@@ -511,11 +506,8 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND))
                   test_fused_dot_product_attention_op)
 endif()
 
-set_tests_properties(test_conv2d_op_depthwise_conv
-                     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_conv2d_api_deprecated PROPERTIES LABELS
                                                            "RUN_TYPE=EXCLUSIVE")
-set_tests_properties(test_conv_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 if(WITH_DISTRIBUTE)
   # FIXME(typhoonzero): add these tests back
   list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
@@ -543,19 +535,23 @@ if(WITH_DISTRIBUTE)
 
   py_test_modules(test_communicator_async MODULES test_communicator_async ENVS
                   ${dist_ENVS})
-  py_test_modules(test_communicator_geo MODULES test_communicator_geo ENVS
-                  ${dist_ENVS})
+  py_test_modules(test_communicator_geo_deprecated MODULES
+                  test_communicator_geo_deprecated ENVS ${dist_ENVS})
   if(NOT APPLE)
     py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS})
-    py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS
-                    ${dist_ENVS})
-    py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS
-                    ${dist_ENVS})
+    py_test_modules(test_fleet_base_deprecated MODULES
+                    test_fleet_base_deprecated ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_base_2_deprecated MODULES
+                    test_fleet_base_2_deprecated ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_base_3_deprecated MODULES
+                    test_fleet_base_3_deprecated ENVS ${dist_ENVS})
     if(NOT WIN32)
-      py_test_modules(test_auto_parallel_partitioner MODULES
-                      test_auto_parallel_partitioner ENVS ${dist_ENVS})
-      py_test_modules(test_auto_parallel_partitioner_gpt MODULES
-                      test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS})
+      py_test_modules(
+        test_auto_parallel_partitioner_deprecated MODULES
+        test_auto_parallel_partitioner_deprecated ENVS ${dist_ENVS})
+      py_test_modules(
+        test_auto_parallel_partitioner_gpt_deprecated MODULES
+        test_auto_parallel_partitioner_gpt_deprecated ENVS ${dist_ENVS})
       py_test_modules(test_auto_parallel_searcher MODULES
                       test_auto_parallel_searcher ENVS ${dist_ENVS})
       py_test_modules(test_auto_parallel_reshard MODULES
@@ -603,22 +599,24 @@ if(WITH_DISTRIBUTE)
 endif()
 
 if(WIN32)
-  py_test_modules(test_feed_data_check_shape_type MODULES
-                  test_feed_data_check_shape_type ENVS CUDA_VISIBLE_DEVICES=0)
+  py_test_modules(
+    test_feed_data_check_shape_type_deprecated MODULES
+    test_feed_data_check_shape_type_deprecated ENVS CUDA_VISIBLE_DEVICES=0)
   py_test_modules(test_fetch_lod_tensor_array MODULES
                   test_fetch_lod_tensor_array ENVS CUDA_VISIBLE_DEVICES=0)
 else()
-  py_test_modules(test_feed_data_check_shape_type MODULES
-                  test_feed_data_check_shape_type)
+  py_test_modules(test_feed_data_check_shape_type_deprecated MODULES
+                  test_feed_data_check_shape_type_deprecated)
   py_test_modules(test_fetch_lod_tensor_array MODULES
                   test_fetch_lod_tensor_array)
 endif()
 
-py_test_modules(test_data_norm_op MODULES test_data_norm_op)
+py_test_modules(test_data_norm_op_deprecated MODULES
+                test_data_norm_op_deprecated)
 py_test_modules(
-  test_fuse_bn_act_pass
+  test_fuse_bn_act_pass_deprecated
   MODULES
-  test_fuse_bn_act_pass
+  test_fuse_bn_act_pass_deprecated
   ENVS
   FLAGS_cudnn_deterministic=1
   FLAGS_cudnn_batchnorm_spatial_persistent=1
@@ -631,8 +629,8 @@ if(NOT WIN32)
 endif()
 
 set_tests_properties(
-  test_data_norm_op test_dataloader_keep_order test_dataloader_unkeep_order
-  PROPERTIES LABELS "RUN_TYPE=DIST")
+  test_data_norm_op_deprecated test_dataloader_keep_order_deprecated
+  test_dataloader_unkeep_order_deprecated PROPERTIES LABELS "RUN_TYPE=DIST")
 
 if(NOT WIN32)
   set_tests_properties(test_multiprocess_reader_exception
@@ -642,99 +640,68 @@ endif()
 
 # setting timeout value as 15S
 set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_lod_tensor_to_selected_rows
+set_tests_properties(test_imperative_lod_tensor_to_selected_rows_deprecated
                      PROPERTIES TIMEOUT 200)
-set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_star_gan_with_gradient_penalty
-                     PROPERTIES TIMEOUT 120)
 
-set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200)
-set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_deformable_conv_op_deprecated PROPERTIES TIMEOUT 200)
+set_tests_properties(test_regularizer_api_deprecated PROPERTIES TIMEOUT 150)
 set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT
                                                                         120)
-set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150)
 if(NOT WIN32)
   if(WITH_NV_JETSON)
     set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 1200)
   endif()
 endif()
-set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120)
+set_tests_properties(test_add_reader_dependency_deprecated PROPERTIES TIMEOUT
+                                                                      120)
 set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_transformer_sorted_gradient
                      PROPERTIES TIMEOUT 120)
-set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 250)
-set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120)
-if(WIN32)
-  set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900)
-else()
-  set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600)
-endif()
 if(WITH_NV_JETSON)
-  set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200)
-  set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500)
-  set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500)
+  set_tests_properties(test_conv3d_transpose_part2_op_deprecated
+                       PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_layer_norm_op_deprecated PROPERTIES TIMEOUT 1500)
 else()
-  set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120)
-  set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 250)
-  set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
+  set_tests_properties(test_conv3d_transpose_part2_op_deprecated
+                       PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_layer_norm_op_deprecated PROPERTIES TIMEOUT 250)
 endif()
 
 set_tests_properties(test_imperative_selected_rows_to_lod_tensor
                      PROPERTIES TIMEOUT 200)
-set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_argsort_op_deprecated PROPERTIES TIMEOUT 120)
 set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300)
 set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT
                                                                         120)
-set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_ptb_rnn PROPERTIES TIMEOUT 120)
-set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80)
-set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60)
-set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_masked_select_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sigmoid_cross_entropy_with_logits_op
                      PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150)
-set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250)
+set_tests_properties(test_sgd_op_deprecated PROPERTIES TIMEOUT 250)
 set_tests_properties(test_generator_dataloader_deprecated PROPERTIES TIMEOUT
                                                                      120)
-set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 500)
-set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 220)
-set_tests_properties(test_program_prune_backward PROPERTIES TIMEOUT 120)
+set_tests_properties(test_program_prune_backward_deprecated PROPERTIES TIMEOUT
+                                                                       120)
 set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 250)
-set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120)
-set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120)
-set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120)
+set_tests_properties(test_decoupled_py_reader_deprecated PROPERTIES TIMEOUT 120)
+set_tests_properties(test_fuse_bn_act_pass_deprecated PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv2d_api_deprecated PROPERTIES TIMEOUT 120)
-set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120)
-set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200)
-set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150)
-set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_regularizer_deprecated PROPERTIES TIMEOUT 150)
 set_tests_properties(test_slice_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_dataloader_keep_order PROPERTIES TIMEOUT 120)
-set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120)
+set_tests_properties(test_slice_op_deprecated PROPERTIES TIMEOUT 120)
+set_tests_properties(test_dataloader_keep_order_deprecated PROPERTIES TIMEOUT
+                                                                      120)
+set_tests_properties(test_dataloader_unkeep_order_deprecated PROPERTIES TIMEOUT
+                                                                        120)
 set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120)
-set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300)
 set_tests_properties(test_split_program PROPERTIES TIMEOUT 120)
-set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60)
-set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60)
 set_tests_properties(test_uniform_random_op_deprecated PROPERTIES TIMEOUT 60)
 
 set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 120)
@@ -756,36 +723,19 @@ set_tests_properties(test_inplace_addto_strategy_deprecated PROPERTIES TIMEOUT
 
 set(TEST_CINN_OPS
     test_softmax_op
-    test_expand_v2_op
     test_reduce_op
     test_slice_op
-    test_full_like_op
-    test_index_select_op
-    test_top_k_v2_op
-    test_elementwise_mul_op
+    test_slice_op_deprecated
     test_gather_nd_op
-    test_elementwise_pow_op
-    test_reshape_op
-    test_meshgrid_op
     test_scale_op
-    test_scatter_op
-    test_layer_norm_op
-    test_cast_op
-    test_roll_op
-    test_atan2_op
-    test_top_k_op
+    test_layer_norm_op_deprecated
     test_where_op
     test_arg_min_max_op
-    test_reverse_op
-    test_flip
-    test_triangular_solve_op
     test_scatter_nd_op
     test_instance_norm_op
+    test_instance_norm_op_deprecated
     test_cumsum_op
-    test_split_op
-    test_erf_op
-    test_assign_op
-    test_flatten_contiguous_range_op)
+    test_erf_op)
 
 foreach(TEST_CINN_OP ${TEST_CINN_OPS})
   if(WITH_CINN)
@@ -798,32 +748,31 @@ foreach(TEST_CINN_OP ${TEST_CINN_OPS})
   endif()
 endforeach()
 
-# In test_conditional_block, the sub block changes the dtype and place of the output variable.
+# In test_conditional_block_deprecated, the sub block changes the dtype and place of the output variable.
 # The changed variable is used in the following op. Static build is not supported for this case.
-set_tests_properties(test_conditional_block
+set_tests_properties(test_conditional_block_deprecated
                      PROPERTIES ENVIRONMENT "FLAGS_new_executor_static_build=0")
 
 # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default.
 set(STATIC_BUILD_TESTS
     test_adamw_op
+    test_adamw_op_deprecated
     test_arg_min_max_op
-    test_batch_norm_op
+    test_batch_norm_op_deprecated
     test_bincount_op
-    test_decoupled_py_reader
-    test_eigh_op
+    test_decoupled_py_reader_deprecated
     test_fetch_lod_tensor_array
-    test_fuse_bn_act_pass
-    test_layer_norm_op
+    test_fuse_bn_act_pass_deprecated
+    test_layer_norm_op_deprecated
     test_lookup_table_v2_op_deprecated
-    test_matmul_op
-    test_matmul_v2_op
     test_momentum_op
-    test_nce
-    test_paddle_save_load_binary
+    test_momentum_op_deprecated
+    test_nce_deprecated
     test_reduce_op
     test_sparse_conv_op
     test_sparse_norm_op
     test_tensor_array_to_tensor
+    test_tensor_array_to_tensor_deprecated
     test_unique
     test_one_hot_v2_op)
 
@@ -854,20 +803,18 @@ foreach(PIR_COVERAGE_TEST ${PIR_COVERAGE_TESTS})
   message(STATUS "PIR Copied OpTest: ${PIR_COVERAGE_TEST}_pir in legacy_test")
 endforeach()
 
-set_tests_properties(test_decoupled_py_reader_static_build PROPERTIES TIMEOUT
-                                                                      120)
-set_tests_properties(test_fuse_bn_act_pass_static_build PROPERTIES TIMEOUT 120)
+set_tests_properties(test_decoupled_py_reader_deprecated_static_build
+                     PROPERTIES TIMEOUT 120)
+set_tests_properties(test_fuse_bn_act_pass_deprecated_static_build
+                     PROPERTIES TIMEOUT 120)
 set_tests_properties(
-  test_fuse_bn_act_pass_static_build
+  test_fuse_bn_act_pass_deprecated_static_build
   PROPERTIES
     ENVIRONMENT
     "FLAGS_cudnn_deterministic=1;FLAGS_cudnn_batchnorm_spatial_persistent=1;FLAGS_conv_workspace_size_limit=1000"
 )
-set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120)
-set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120)
-set_tests_properties(test_layer_norm_op_static_build PROPERTIES TIMEOUT 1500)
-set_tests_properties(test_paddle_save_load_binary_static_build
-                     PROPERTIES TIMEOUT 120)
+set_tests_properties(test_layer_norm_op_deprecated_static_build
+                     PROPERTIES TIMEOUT 1500)
 set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500)
 py_test_modules(test_stride MODULES test_stride ENVS
                 FLAGS_use_stride_kernel=true)
@@ -875,6 +822,6 @@ py_test_modules(test_stride MODULES test_stride ENVS
 set_tests_properties(test_linalg_matrix_exp PROPERTIES TIMEOUT 120)
 set_pir_tests_properties()
 
-set_tests_properties(test_fractional_max_pool2d_op PROPERTIES TIMEOUT 120)
-
 set_tests_properties(test_reduce_as_op PROPERTIES TIMEOUT 30)
+set_tests_properties(test_attribute_var_deprecated PROPERTIES TIMEOUT 100)
+set_tests_properties(test_inference_api_deprecated PROPERTIES TIMEOUT 100)
diff --git a/test/deprecated/legacy_test/dist_test.sh b/test/deprecated/legacy_test/dist_test.sh
index 69a893a7ddc13..3ae7b209f4a00 100644
--- a/test/deprecated/legacy_test/dist_test.sh
+++ b/test/deprecated/legacy_test/dist_test.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -82,7 +82,7 @@ if [[ $exit_code -ne 0 ]]; then
 fi
 
 #display system context
-for i in {1..2}; do 
+for i in {1..2}; do
     sleep 3
     ps -aux
     netstat -anlp
diff --git a/test/deprecated/legacy_test/run_server_for_communicator_geo.py b/test/deprecated/legacy_test/run_server_for_communicator_geo.py
index c384459a0ffbc..c8a7ed8f8373e 100644
--- a/test/deprecated/legacy_test/run_server_for_communicator_geo.py
+++ b/test/deprecated/legacy_test/run_server_for_communicator_geo.py
@@ -13,8 +13,12 @@
 # limitations under the License.
 
 import os
+import sys
 
-from test_communicator_geo import TestCommunicatorGeoEnd2End
+sys.path.append(".")
+from test_communicator_geo_deprecated import (
+    TestCommunicatorGeoEnd2End,
+)
 
 import paddle
 
diff --git a/test/deprecated/legacy_test/test_adam_op.py b/test/deprecated/legacy_test/test_adam_op.py
index 0693d4f664356..50caa25457671 100644
--- a/test/deprecated/legacy_test/test_adam_op.py
+++ b/test/deprecated/legacy_test/test_adam_op.py
@@ -648,39 +648,6 @@ def test_check_output(self):
 
 
 class TestAdamOpV2(unittest.TestCase):
-    def test_adam_op(self):
-        place = base.CPUPlace()
-        shape = [2, 3, 8, 8]
-        exe = base.Executor(place)
-        train_prog = base.Program()
-        startup = base.Program()
-        with base.program_guard(train_prog, startup):
-            with base.unique_name.guard():
-                data = paddle.static.data(name="data", shape=shape)
-                conv = paddle.static.nn.conv2d(data, 8, 3)
-                loss = paddle.mean(conv)
-
-                beta1 = paddle.static.create_global_var(
-                    shape=[1], value=0.85, dtype='float32', persistable=True
-                )
-                beta2 = paddle.static.create_global_var(
-                    shape=[1], value=0.95, dtype='float32', persistable=True
-                )
-                betas = [beta1, beta2]
-                opt = paddle.optimizer.Adam(
-                    learning_rate=1e-5,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01,
-                    epsilon=1e-8,
-                )
-                opt.minimize(loss)
-
-        exe.run(startup)
-        data_np = np.random.random(shape).astype('float32')
-        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
-        assert rets[0] is not None
-
     def test_pir_adam_op(self):
         with paddle.pir_utils.IrGuard():
             place = base.CPUPlace()
diff --git a/test/deprecated/legacy_test/test_adam_op_deprecated.py b/test/deprecated/legacy_test/test_adam_op_deprecated.py
new file mode 100644
index 0000000000000..357d2f9b438a4
--- /dev/null
+++ b/test/deprecated/legacy_test/test_adam_op_deprecated.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+paddle.enable_static()
+
+
+class TestAdamOpV2(unittest.TestCase):
+    def test_adam_op(self):
+        place = base.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = base.Executor(place)
+        train_prog = base.Program()
+        startup = base.Program()
+        with base.program_guard(train_prog, startup):
+            with base.unique_name.guard():
+                data = paddle.static.data(name="data", shape=shape)
+                conv = paddle.static.nn.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+
+                beta1 = paddle.static.create_global_var(
+                    shape=[1], value=0.85, dtype='float32', persistable=True
+                )
+                beta2 = paddle.static.create_global_var(
+                    shape=[1], value=0.95, dtype='float32', persistable=True
+                )
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.Adam(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8,
+                )
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_adamax_api_deprecated.py b/test/deprecated/legacy_test/test_adamax_api_deprecated.py
new file mode 100644
index 0000000000000..6f1d806be7eea
--- /dev/null
+++ b/test/deprecated/legacy_test/test_adamax_api_deprecated.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+
+class TestAdamaxAPI(unittest.TestCase):
+    def test_adamax_api(self):
+        paddle.enable_static()
+        place = base.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = base.Executor(place)
+        train_prog = base.Program()
+        startup = base.Program()
+        with base.program_guard(train_prog, startup):
+            with base.unique_name.guard():
+                data = paddle.static.data(name="data", shape=shape)
+                conv = paddle.static.nn.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+                beta1 = 0.85
+                beta2 = 0.95
+                opt = paddle.optimizer.Adamax(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8,
+                )
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_adamw_op.py b/test/deprecated/legacy_test/test_adamw_op.py
index 1c901e8d4baf5..53465c1dff73b 100644
--- a/test/deprecated/legacy_test/test_adamw_op.py
+++ b/test/deprecated/legacy_test/test_adamw_op.py
@@ -262,41 +262,6 @@ def test_adamw_op_coverage(self):
         )
         assert adam.__str__() is not None
 
-    def test_adamw_op(self):
-        paddle.enable_static()
-        place = base.CPUPlace()
-        shape = [2, 3, 8, 8]
-        exe = base.Executor(place)
-        train_prog = base.Program()
-        startup = base.Program()
-        with base.program_guard(train_prog, startup):
-            with base.unique_name.guard():
-                data = paddle.static.data(name="data", shape=shape)
-                conv = paddle.static.nn.conv2d(data, 8, 3)
-                loss = paddle.mean(conv)
-
-                beta1 = paddle.static.create_global_var(
-                    shape=[1], value=0.85, dtype='float32', persistable=True
-                )
-                beta2 = paddle.static.create_global_var(
-                    shape=[1], value=0.95, dtype='float32', persistable=True
-                )
-                betas = [beta1, beta2]
-                opt = paddle.optimizer.AdamW(
-                    learning_rate=1e-5,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01,
-                    epsilon=1e-8,
-                )
-                opt.minimize(loss)
-
-        exe.run(startup)
-        data_np = np.random.random(shape).astype('float32')
-        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
-        assert rets[0] is not None
-        paddle.disable_static()
-
     def test_pir_adam_op(self):
         with paddle.pir_utils.IrGuard():
             place = base.CPUPlace()
diff --git a/test/deprecated/legacy_test/test_adamw_op_deprecated.py b/test/deprecated/legacy_test/test_adamw_op_deprecated.py
new file mode 100644
index 0000000000000..c5f5aa5453bbf
--- /dev/null
+++ b/test/deprecated/legacy_test/test_adamw_op_deprecated.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+
+class TestAdamWOp(unittest.TestCase):
+    def test_adamw_op(self):
+        paddle.enable_static()
+        place = base.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = base.Executor(place)
+        train_prog = base.Program()
+        startup = base.Program()
+        with base.program_guard(train_prog, startup):
+            with base.unique_name.guard():
+                data = paddle.static.data(name="data", shape=shape)
+                conv = paddle.static.nn.conv2d(data, 8, 3)
+                loss = paddle.mean(conv)
+
+                beta1 = paddle.static.create_global_var(
+                    shape=[1], value=0.85, dtype='float32', persistable=True
+                )
+                beta2 = paddle.static.create_global_var(
+                    shape=[1], value=0.95, dtype='float32', persistable=True
+                )
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.AdamW(
+                    learning_rate=1e-5,
+                    beta1=beta1,
+                    beta2=beta2,
+                    weight_decay=0.01,
+                    epsilon=1e-8,
+                )
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+        paddle.disable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py b/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py
index 880a7cf949a62..5ed16ca8675b1 100644
--- a/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py
+++ b/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py
@@ -16,7 +16,7 @@
 import unittest
 
 import numpy as np
-from test_attribute_var import UnittestBase
+from test_attribute_var_deprecated import UnittestBase
 
 import paddle
 from paddle.base import Program, core, program_guard
diff --git a/test/deprecated/legacy_test/test_add_reader_dependency.py b/test/deprecated/legacy_test/test_add_reader_dependency_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_add_reader_dependency.py
rename to test/deprecated/legacy_test/test_add_reader_dependency_deprecated.py
index 9dfd4a500d1f7..e844d26679dc2 100644
--- a/test/deprecated/legacy_test/test_add_reader_dependency.py
+++ b/test/deprecated/legacy_test/test_add_reader_dependency_deprecated.py
@@ -21,6 +21,8 @@
 from paddle import base
 from paddle.base.layer_helper import LayerHelper
 
+paddle.enable_static()
+
 
 def inplace_add(x, bias):
     helper = LayerHelper('scale', **locals())
diff --git a/test/deprecated/legacy_test/test_arg_min_max_op.py b/test/deprecated/legacy_test/test_arg_min_max_op.py
index c35fa9f8f7d39..69b98997aeed5 100644
--- a/test/deprecated/legacy_test/test_arg_min_max_op.py
+++ b/test/deprecated/legacy_test/test_arg_min_max_op.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 from op_test import OpTest, convert_float_to_uint16
-from test_attribute_var import UnittestBase
+from test_attribute_var_deprecated import UnittestBase
 
 import paddle
 from paddle.base import Program, program_guard
diff --git a/test/deprecated/legacy_test/test_argsort_op_deprecated.py b/test/deprecated/legacy_test/test_argsort_op_deprecated.py
new file mode 100644
index 0000000000000..96cd761267082
--- /dev/null
+++ b/test/deprecated/legacy_test/test_argsort_op_deprecated.py
@@ -0,0 +1,355 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+from paddle.base.backward import append_backward
+from paddle.base.executor import Executor
+from paddle.base.framework import Program, grad_var_name
+
+np.random.seed(123)
+paddle.enable_static()
+
+
+class PyArgsort:
+    def __init__(self, input_shape, axis, descending, dtype):
+        self.x = np.random.random(input_shape).astype(dtype)
+        self.label = np.random.random(input_shape).astype(dtype)
+        if axis < 0:
+            self.axis = axis + len(self.x.shape)
+        else:
+            self.axis = axis
+        self.descending = descending
+
+    def forward(self):
+        if self.descending:
+            self.indices = np.flip(
+                np.argsort(self.x, kind='quicksort', axis=self.axis), self.axis
+            )
+            self.sorted_x = np.flip(
+                np.sort(self.x, kind='quicksort', axis=self.axis), self.axis
+            )
+        else:
+            self.indices = np.argsort(self.x, kind='quicksort', axis=self.axis)
+            self.sorted_x = np.sort(self.x, kind='quicksort', axis=self.axis)
+        self.loss = self.sorted_x * self.label
+        self.loss = np.sum(self.loss)
+        out = (
+            np.array(self.indices, dtype=self.indices.dtype),
+            np.array(self.sorted_x, dtype=self.sorted_x.dtype),
+            np.array(self.loss, dtype=self.loss.dtype),
+        )
+        return out
+
+
+def create_tensor(np_data, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_data, place)
+    return tensor
+
+
+class TestArgsortOpCPU(unittest.TestCase):
+    def setup_program(self):
+        self.main_program = Program()
+        self.startup_program = Program()
+        self.init_place()
+
+    def setUp(self):
+        paddle.enable_static()
+        self.init_axis()
+        self.init_datatype()
+        self.init_direction()
+        self.init_inputshape()
+
+        self.setup_program()
+        self.feed_data_field = {"x", "label"}
+        self.grad_data_field = {"x"}
+
+        self.py_argsort = PyArgsort(
+            self.input_shape, self.axis, self.descending, self.dtype
+        )
+
+        with base.program_guard(self.main_program, self.startup_program):
+            x = paddle.static.data(
+                name="x", shape=[-1] + list(self.input_shape), dtype=self.dtype
+            )
+            x.stop_gradient = False
+            x.desc.set_need_check_feed(False)
+            label = paddle.static.data(
+                name="label",
+                shape=[-1] + list(self.input_shape),
+                dtype=self.dtype,
+            )
+            label.desc.set_need_check_feed(False)
+            self.index = paddle.argsort(
+                x=x, axis=self.axis, descending=self.descending
+            )
+            self.sorted_x = paddle.sort(
+                x=x, axis=self.axis, descending=self.descending
+            )
+            self.sorted_x.stop_gradient = False
+            loss = paddle.multiply(self.sorted_x, label)
+            self.loss = paddle.sum(loss)
+
+    def forward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_argsort, x), self.place)
+            for x in self.feed_data_field
+        }
+        exe = Executor(self.place)
+        out = exe.run(
+            self.main_program,
+            feed=self.feed_map,
+            fetch_list=[self.index, self.sorted_x, self.loss],
+        )
+        return out
+
+    def backward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_argsort, x), self.place)
+            for x in self.feed_data_field
+        }
+        fetch_list = [
+            self.main_program.global_block().var(grad_var_name(x))
+            for x in self.grad_data_field
+        ]
+        exe = Executor(self.place)
+        out = exe.run(
+            self.main_program,
+            feed=self.feed_map,
+            fetch_list=fetch_list,
+            return_numpy=False,
+        )
+        return out
+
+    def test_backward(self, numeric_grad_delta=1e-5, max_relative_error=1e-7):
+        self.check_forward()
+
+        with base.program_guard(self.main_program, self.startup_program):
+            append_backward(self.loss)
+
+        ana_grad = [np.array(x) for x in self.backward()]
+
+        num_grad = self.get_numerical_gradient(delta=numeric_grad_delta)
+        self.assert_is_close(
+            num_grad,
+            ana_grad,
+            'x',
+            max_relative_error=max_relative_error,
+            msg_prefix="Gradient Check On %s" % str(self.place),
+        )
+
+    def check_forward(self):
+        pd_outputs = self.forward()
+        py_outputs = self.py_argsort.forward()
+        for pd_output, py_output in zip(pd_outputs, py_outputs):
+            self.assertEqual(pd_output.shape, py_output.shape)
+            np.testing.assert_allclose(
+                pd_output, py_output, rtol=1e-05, atol=0, equal_nan=False
+            )
+
+    def get_numerical_gradient(self, delta=1e-7):
+        if self.dtype == 'float16':
+            delta = np.array(delta).astype(np.float16)
+        feed_list = [getattr(self.py_argsort, x) for x in self.grad_data_field]
+        grad_list = [np.zeros_like(x) for x in feed_list]
+        for feed, grad in zip(feed_list, grad_list):
+            for f, g in np.nditer([feed, grad], op_flags=['readwrite']):
+                o = float(f)
+                f[...] = o + delta
+                y_pos = self.forward()[2]
+
+                f[...] = o - delta
+                y_neg = self.forward()[2]
+
+                f[...] = o
+                dout_dfeed = (y_pos - y_neg) / (delta * 2)
+                g[...] = dout_dfeed
+
+        return grad_list
+
+    def assert_is_close(
+        self,
+        numeric_grads,
+        analytic_grads,
+        names,
+        max_relative_error,
+        msg_prefix,
+    ):
+        for a, b, name in zip(numeric_grads, analytic_grads, names):
+            abs_a = np.abs(a)
+            abs_a[abs_a < 1e-3] = 1
+
+            diff_mat = np.abs(a - b) / abs_a
+            max_diff = np.max(diff_mat)
+
+            def err_msg():
+                offset = np.argmax(diff_mat > max_relative_error)
+                return (
+                    "%s error, %s variable %s max gradient diff %f over limit %f, "
+                    "the first error element is %d, expected %f, but got %f."
+                ) % (
+                    'argsort',
+                    msg_prefix,
+                    name,
+                    max_diff,
+                    max_relative_error,
+                    offset,
+                    a.flatten()[offset],
+                    b.flatten()[offset],
+                )
+
+            self.assertLessEqual(max_diff, max_relative_error, err_msg())
+
+    def init_axis(self):
+        self.axis = -1
+
+    def init_datatype(self):
+        self.dtype = "float64"
+
+    def init_direction(self):
+        self.descending = False
+
+    def init_inputshape(self):
+        self.input_shape = (2, 2, 2, 2, 3)
+
+    def init_place(self):
+        self.place = core.CPUPlace()
+
+
+class TestArgsortOpGPU(TestArgsortOpCPU):
+    def init_place(self):
+        if core.is_compiled_with_cuda():
+            self.place = core.CUDAPlace(0)
+        else:
+            self.place = core.CPUPlace()
+
+
+class TestArgsortOpAxis0CPU(TestArgsortOpCPU):
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestArgsortOpAxis0GPU(TestArgsortOpGPU):
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestArgsortOpAxis1CPU(TestArgsortOpCPU):
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestArgsortOpAxis1GPU(TestArgsortOpGPU):
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestArgsortOpAxis2CPU(TestArgsortOpCPU):
+    def init_axis(self):
+        self.axis = 2
+
+
+class TestArgsortOpAxis2GPU(TestArgsortOpGPU):
+    def init_axis(self):
+        self.axis = 2
+
+
+class TestArgsortOpAxisNeg1CPU(TestArgsortOpCPU):
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestArgsortOpAxisNeg1GPU(TestArgsortOpGPU):
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestArgsortOpAxisNeg2CPU(TestArgsortOpCPU):
+    def init_axis(self):
+        self.axis = -2
+
+
+class TestArgsortOpAxisNeg2GPU(TestArgsortOpGPU):
+    def init_axis(self):
+        self.axis = -2
+
+
+class TestArgsortOpDescendingAxisCPU(TestArgsortOpCPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxisGPU(TestArgsortOpGPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxis0CPU(TestArgsortOpAxis0CPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxis0GPU(TestArgsortOpAxis0GPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxis1CPU(TestArgsortOpAxis1CPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxis1GPU(TestArgsortOpAxis1GPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxis2CPU(TestArgsortOpAxis2CPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxis2GPU(TestArgsortOpAxis2GPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxisNeg1CPU(TestArgsortOpAxisNeg1CPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxisNeg1GPU(TestArgsortOpAxisNeg1GPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxisNeg2CPU(TestArgsortOpAxisNeg2CPU):
+    def init_direction(self):
+        self.descending = True
+
+
+class TestArgsortOpDescendingAxisNeg2GPU(TestArgsortOpAxisNeg2GPU):
+    def init_direction(self):
+        self.descending = True
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_attribute_var_deprecated.py b/test/deprecated/legacy_test/test_attribute_var_deprecated.py
new file mode 100644
index 0000000000000..5f09dff909395
--- /dev/null
+++ b/test/deprecated/legacy_test/test_attribute_var_deprecated.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.inference as paddle_infer
+from paddle.base.framework import Program, program_guard
+
+paddle.enable_static()
+
+
+class UnittestBase(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.init_info()
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
+    def init_info(self):
+        self.shapes = None
+        self.save_path = None
+
+    def path_prefix(self):
+        return type(self).__name__
+
+    def infer_prog(self):
+        config = paddle_infer.Config(
+            self.save_path + '.pdmodel', self.save_path + '.pdiparams'
+        )
+        config.disable_mkldnn()
+        predictor = paddle_infer.create_predictor(config)
+        input_names = predictor.get_input_names()
+        for i, shape in enumerate(self.shapes):
+            input_handle = predictor.get_input_handle(input_names[i])
+            self.fake_input = np.random.randn(*shape).astype("float32")
+            input_handle.reshape(shape)
+            input_handle.copy_from_cpu(self.fake_input)
+        predictor.run()
+        output_names = predictor.get_output_names()
+        res = []
+        for out_name in output_names:
+            output_handle = predictor.get_output_handle(out_name)
+            output_data = output_handle.copy_to_cpu()
+            res.append(output_data)
+
+        if len(output_names) == 1:
+            res = res[0]
+
+        return res
+
+
+class TestDropout(UnittestBase):
+    def init_info(self):
+        self.shapes = [[10, 10]]
+        self.save_path = os.path.join(self.temp_dir.name, 'dropout')
+
+    def test_static(self):
+        main_prog = Program()
+        startup_prog = Program()
+        with program_guard(main_prog, startup_prog):
+            fc = paddle.nn.Linear(10, 10)
+            x = paddle.randn(self.shapes[0])
+            x.stop_gradient = False
+            feat = fc(x)
+            # p is a Variable
+            p = paddle.randn([1])
+            out = paddle.nn.functional.dropout(feat, p=p)
+            sgd = paddle.optimizer.SGD()
+            sgd.minimize(paddle.mean(out))
+            # test _to_string
+            self.assertTrue("Var[" in str(main_prog))
+
+            exe = paddle.static.Executor()
+            exe.run(startup_prog)
+            res = exe.run(fetch_list=[x, out])
+            # export model
+            paddle.static.save_inference_model(self.save_path, [x], [out], exe)
+
+            # Test for Inference Predictor
+            infer_out = self.infer_prog()
+            self.assertEqual(infer_out.shape, (10, 10))
+
+            self.assertEqual(
+                main_prog.block(0).ops[4].all_attrs()['dropout_prob'].name,
+                p.name,
+            )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_auto_parallel_partitioner.py b/test/deprecated/legacy_test/test_auto_parallel_partitioner_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_auto_parallel_partitioner.py
rename to test/deprecated/legacy_test/test_auto_parallel_partitioner_deprecated.py
diff --git a/test/deprecated/legacy_test/test_auto_parallel_partitioner_gpt.py b/test/deprecated/legacy_test/test_auto_parallel_partitioner_gpt_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_auto_parallel_partitioner_gpt.py
rename to test/deprecated/legacy_test/test_auto_parallel_partitioner_gpt_deprecated.py
diff --git a/test/deprecated/legacy_test/test_avoid_twice_initialization.py b/test/deprecated/legacy_test/test_avoid_twice_initialization_deprecated.py
similarity index 98%
rename from test/deprecated/legacy_test/test_avoid_twice_initialization.py
rename to test/deprecated/legacy_test/test_avoid_twice_initialization_deprecated.py
index 5afb4cd20ccc3..fdca1d76954f3 100644
--- a/test/deprecated/legacy_test/test_avoid_twice_initialization.py
+++ b/test/deprecated/legacy_test/test_avoid_twice_initialization_deprecated.py
@@ -17,6 +17,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestAvoidTwiceInitialization(unittest.TestCase):
     def test_avoid_twice_initialization(self):
diff --git a/test/deprecated/legacy_test/test_backward.py b/test/deprecated/legacy_test/test_backward_deprecated.py
similarity index 88%
rename from test/deprecated/legacy_test/test_backward.py
rename to test/deprecated/legacy_test/test_backward_deprecated.py
index 04aeadc038213..64a3dfe7e778d 100644
--- a/test/deprecated/legacy_test/test_backward.py
+++ b/test/deprecated/legacy_test/test_backward_deprecated.py
@@ -19,7 +19,8 @@
 import paddle
 import paddle.nn.functional as F
 from paddle import base, static
-from paddle.base import backward
+
+paddle.enable_static()
 
 
 class BackwardNet:
@@ -411,58 +412,6 @@ def test_gradient_with_optimizer(self):
             self._check_grad_op_name(forward_list, optimized_list)
 
 
-# TODO(Aurelius84): add conditional network test
-class ConditionalNet(BackwardNet):
-    def __init__(self):
-        super().__init__()
-
-
-class TestBackwardUninitializedVariable(unittest.TestCase):
-    """this case is found in yolov5 while to_static.
-    gradient aggregation may cause sum a invalid variable.
-    """
-
-    def test(self):
-        paddle.enable_static()
-        main_prg, startup_prg = paddle.static.Program(), paddle.static.Program()
-        with paddle.static.program_guard(main_prg, startup_prg):
-            gt = paddle.static.data(name='gt', shape=[4], dtype='float32')
-            x = paddle.static.data(name='x', shape=[2], dtype='float32')
-            gt.stop_gradient = True
-            x.stop_gradient = False
-            gt = gt.reshape([4, 1]).reshape([4])
-            loss = (
-                paddle.nn.functional.binary_cross_entropy(x, gt[:2])
-                + (gt[2:4] * x).sum()
-            )
-            exe = paddle.static.Executor()
-            paddle.base.backward.gradients(loss, [])
-            exe.run(startup_prg)
-            # Optimizer
-            out = exe.run(
-                main_prg,
-                feed={
-                    'gt': np.array([1.0, 1.0, 0.0, 0.0], dtype='float32'),
-                    'x': np.array([0.5, 0.5], dtype='float32'),
-                },
-                fetch_list=[loss],
-            )
-            print(out)
-
-
-class TestStripGradSuffix(unittest.TestCase):
-    def test_strip_grad_suffix(self):
-        cases = (
-            ('x@GRAD', 'x'),
-            ('x@GRAD@GRAD', 'x'),
-            ('x@GRAD@RENAME@1', 'x'),
-            ('x@GRAD_slice_0@GRAD', 'x@GRAD_slice_0'),
-            ('grad/grad/x@GRAD@RENAME@block0@1@GRAD', 'x'),
-        )
-        for input_, desired in cases:
-            self.assertEqual(backward._strip_grad_suffix_(input_), desired)
-
-
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py b/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py
new file mode 100644
index 0000000000000..9c63d513e09d2
--- /dev/null
+++ b/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py
@@ -0,0 +1,528 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+from op_test import (
+    _set_use_system_allocator,
+)
+
+import paddle
+from paddle import base
+from paddle.base import core
+
+paddle.enable_static()
+
+_set_use_system_allocator(True)
+
+
+def _cal_mean_variance(x, epsilon, data_format):
+    # Per-channel mean / biased variance; `epsilon` is accepted but unused.
+    assert data_format in ['NCHW', 'NHWC']
+    channels_first = data_format == 'NCHW'
+    dims = x.shape
+    if len(dims) == 3:
+        # Insert a unit spatial axis: NCL -> NCL1, NLC -> NL1C.
+        if channels_first:
+            x = np.reshape(x, (dims[0], dims[1], dims[2], 1))
+        else:
+            x = np.reshape(x, (dims[0], dims[1], 1, dims[2]))
+    reduce_axes = (0, 2, 3) if channels_first else (0, 1, 2)
+    per_channel = np.size(x) / (x.shape[1] if channels_first else x.shape[-1])
+    mean = np.sum(x, axis=reduce_axes) / per_channel
+    sq_mean = np.sum(x * x, reduce_axes) / per_channel
+    var = sq_mean - mean * mean
+    return mean, var
+
+
+def _reference_training(x, scale, offset, epsilon, data_format):
+    """NumPy reference for training-mode batch norm; returns (y, mean, var)."""
+    x_shape = x.shape
+    if len(x_shape) == 3:
+        if data_format == "NCHW":  # NCL -> NCL1
+            x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1))
+        else:  # NLC -> NL1C
+            x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2]))
+
+    if data_format == "NCHW":
+        n, c, h, w = x.shape
+        x_square = x * x
+        x_square_sum = np.sum(x_square, (0, 2, 3))
+        x_sum = np.sum(x, axis=(0, 2, 3))
+        element_count = np.size(x) / int(np.shape(x)[1])
+        mean = x_sum / element_count
+        var = x_square_sum / element_count - mean * mean
+        mean_tile = np.reshape(mean, (1, c, 1, 1))
+        mean_tile = np.tile(mean_tile, (n, 1, h, w))
+        var_tile = np.reshape(var, (1, c, 1, 1))
+        var_tile = np.tile(var_tile, (n, 1, h, w))
+        normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
+        scale_tile = np.reshape(scale, (1, c, 1, 1))
+        scale_tile = np.tile(scale_tile, (n, 1, h, w))
+        offset_tile = np.reshape(offset, (1, c, 1, 1))
+        offset_tile = np.tile(offset_tile, (n, 1, h, w))  # mirror scale_tile
+        y = normalized * scale_tile + offset_tile
+    elif data_format == "NHWC":
+        x_square = x * x
+        x_square_sum = np.sum(x_square, (0, 1, 2))
+        x_sum = np.sum(x, axis=(0, 1, 2))
+        element_count = np.size(x) / int(np.shape(x)[-1])
+        mean = x_sum / element_count
+        var = x_square_sum / element_count - mean * mean
+        normalized = (x - mean) / np.sqrt(var + epsilon)
+        y = normalized * scale + offset
+    else:
+        raise ValueError("Unknown data order.")
+
+    if len(x_shape) == 3:
+        y = np.reshape(y, x_shape)
+    return y, mean, var
+
+
+def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
+    # Use the following formulas to calculate gradients:
+    # grad_scale =
+    #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
+    #
+    # grad_offset = sum(output_y)
+    #
+    # x_grad =
+    #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
+    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
+
+    # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    if data_format != "NCHW" and data_format != "NHWC":
+        raise ValueError("Unknown data order.")
+
+    x_shape = x.shape
+    if len(x_shape) == 3:
+        if data_format == "NCHW":  # NCL -> NCL1
+            x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1))
+            y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], x_shape[2], 1))
+        else:  # NLC -> NL1C
+            x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2]))
+            y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], 1, x_shape[2]))
+
+    if data_format == "NCHW":
+        x = np.transpose(x, (0, 2, 3, 1))
+        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
+
+    x_grad = (
+        scale
+        * (
+            y_grad
+            - np.mean(y_grad, axis=(0, 1, 2))
+            - (x - mean)
+            * np.mean(y_grad * (x - mean), axis=(0, 1, 2))
+            / (var + epsilon)
+        )
+        / np.sqrt(var + epsilon)
+    )
+    grad_scale = np.sum(
+        y_grad * (x - mean) / np.sqrt(var + epsilon), axis=(0, 1, 2)
+    )
+    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
+
+    # transfer back to N, C, H, W
+    if data_format == "NCHW":
+        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
+        x = np.transpose(x, (0, 3, 1, 2))
+        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
+
+    if len(x_shape) == 3:
+        x_grad = np.reshape(x_grad, x_shape)
+
+    return x_grad, grad_scale, grad_offset
+
+
+class TestBatchNormOpTraining(unittest.TestCase):
+    def setUp(self):
+        self.use_mkldnn = False
+        self.fuse_with_relu = False
+        self.data_formats = ["NCHW", "NHWC"]
+        self.momentum = 0.9
+        self.use_momentum_variable = False
+        self.epsilon = 0.00001
+        self.init_kernel_type()
+        self.init_test_case()
+
+    def init_test_case(self):
+        self.use_global_stats = False
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y',
+            'mean',
+            'variance',
+            'saved_mean',
+            'saved_variance',
+            'x@GRAD',
+            'scale@GRAD',
+            'bias@GRAD',
+        ]
+
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def ref_forward_backward(
+        self,
+        x,
+        y_grad,
+        scale,
+        bias,
+        mean,
+        variance,
+        epsilon,
+        momentum,
+        shape,
+        data_layout,
+    ):
+        # run forward
+        y, saved_mean, var_ref = _reference_training(
+            x, scale, bias, epsilon, data_layout
+        )
+        mean_out = saved_mean * (1.0 - momentum) + momentum * mean
+        variance_out = var_ref * (1.0 - momentum) + momentum * variance
+        saved_variance = 1.0 / np.sqrt(var_ref + epsilon)
+        # run backward
+        x_grad, scale_grad, bias_grad = _reference_grad(
+            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout
+        )
+
+        return (
+            y,
+            mean_out,
+            variance_out,
+            saved_mean,
+            saved_variance,
+            x_grad,
+            scale_grad,
+            bias_grad,
+        )
+
+    def set_mean_variance(self, scale_shape, x, data_layout):
+        mean, variance = _cal_mean_variance(x, self.epsilon, data_layout)
+        mean_pre = np.zeros(scale_shape).astype(np.float32)
+        variance_pre = np.ones(scale_shape).astype(np.float32)
+        # computing global mean/variance for one step
+        if self.use_global_stats:
+            mom = self.momentum
+            mean = mean * (1.0 - mom) + mom * mean_pre
+            variance = variance * (1.0 - mom) + mom * variance_pre
+        return mean, variance
+
+    def test_forward_backward(self):
+        def test_with_place(place, data_layout, shape):
+            # attr
+            epsilon = self.epsilon
+            momentum = self.momentum
+            if data_layout == "NCHW":
+                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
+            else:
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+            scale_shape = [c]
+
+            np.random.seed(123)
+            x = np.random.random_sample(shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
+            mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
+            y_grad = np.random.random_sample(shape).astype(np.float32)
+            momentum_var = np.array([momentum]).astype(np.float32)
+
+            (
+                y,
+                mean_out,
+                variance_out,
+                saved_mean,
+                saved_variance,
+                x_grad,
+                scale_grad,
+                bias_grad,
+            ) = self.ref_forward_backward(
+                x,
+                y_grad,
+                scale,
+                bias,
+                mean,
+                variance,
+                epsilon,
+                momentum,
+                shape,
+                data_layout,
+            )
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+            var_dict['x@GRAD'] = x_grad
+            var_dict['scale@GRAD'] = scale_grad
+            var_dict['bias@GRAD'] = bias_grad
+
+            var_names = [
+                'x',
+                'scale',
+                'bias',
+                'mean',
+                'variance',
+                'y',
+                'saved_mean',
+                'saved_variance',
+                'momentum_var',
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = base.Program()
+            with base.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape,
+                    )
+                inputs = {
+                    "X": block.var('x'),
+                    "Scale": block.var('scale'),
+                    "Bias": block.var('bias'),
+                    "Mean": block.var('mean'),
+                    "Variance": block.var('variance'),
+                }
+                attrs = {
+                    "epsilon": epsilon,
+                    "is_test": False,
+                    "data_layout": data_layout,
+                    "use_mkldnn": self.use_mkldnn,
+                    "fuse_with_relu": self.fuse_with_relu,
+                    "use_global_stats": self.use_global_stats,
+                }
+                if self.use_momentum_variable:
+                    inputs['MomentumTensor'] = block.var('momentum_var')
+                else:
+                    attrs['momentum'] = momentum
+
+                outputs = {
+                    "Y": block.var('y'),
+                    "MeanOut": block.var('mean'),  # share memory
+                    "VarianceOut": block.var('variance'),  # share memory
+                    "SavedMean": block.var('saved_mean'),
+                    "SavedVariance": block.var('saved_variance'),
+                }
+                block.create_var(name="reserve_space", dtype='float32')
+                outputs["ReserveSpace"] = block.var('reserve_space')
+                bn_op = block.append_op(
+                    type="batch_norm",
+                    inputs=inputs,
+                    outputs=outputs,
+                    attrs=attrs,
+                )
+                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    bn_op.desc, self.no_grad_set, []
+                )
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                program._sync_with_cpp()
+
+                exe = base.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in [
+                            'x',
+                            'scale',
+                            'bias',
+                            'mean',
+                            'variance',
+                            'y@GRAD',
+                            'momentum_var',
+                        ]
+                    },
+                    fetch_list=self.fetch_list,
+                )
+
+            for id, name in enumerate(self.fetch_list):
+                if name == 'variance':
+                    self.__assert_close(
+                        var_dict[name], out[id], name, atol=1e-3
+                    )
+                    continue
+                self.__assert_close(var_dict[name], out[id], name)
+            print("op test forward passed: ", str(place), data_layout)
+
+        places = [core.CPUPlace()]
+
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            for data_format in self.data_formats:
+                test_with_place(place, data_format, [2, 3, 4, 5])
+
+    def init_kernel_type(self):
+        pass  # no-op body; NOTE(review): presumably a hook for subclasses — confirm
+
+
+class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
+    def init_test_case(self):  # configuration: use_global_stats off, scale/bias grads in no_grad_set
+        self.use_global_stats = False
+        self.no_grad_set = {'scale@GRAD', 'bias@GRAD'}  # these two names are also absent from fetch_list
+        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
+
+
+class TestBatchNormOpTrainingCase2(TestBatchNormOpTraining):
+    def init_test_case(self):  # configuration: empty no_grad_set, all eight outputs fetched
+        self.use_global_stats = False
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y',
+            'mean',
+            'variance',
+            'saved_mean',
+            'saved_variance',
+            'x@GRAD',
+            'scale@GRAD',
+            'bias@GRAD',
+        ]
+        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = "1"  # NOTE(review): process-wide; never restored in this class
+
+
+class TestBatchNormOpTrainingCase3(TestBatchNormOpTraining):
+    def init_test_case(self):  # configuration: use_global_stats off, x@GRAD in no_grad_set
+        self.use_global_stats = False
+        self.no_grad_set = {'x@GRAD'}  # x@GRAD is also absent from fetch_list
+        self.fetch_list = ['y', 'mean', 'variance', 'scale@GRAD', 'bias@GRAD']
+
+
+class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining):
+    def init_test_case(self):  # configuration: same fetch_list as Case2 plus the momentum-variable flag
+        self.use_momentum_variable = True  # NOTE(review): how the base test consumes this flag is not visible here — confirm
+        self.use_global_stats = False
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y',
+            'mean',
+            'variance',
+            'saved_mean',
+            'saved_variance',
+            'x@GRAD',
+            'scale@GRAD',
+            'bias@GRAD',
+        ]
+
+
+class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
+    def init_test_case(self):  # configuration: use_global_stats on, no saved_mean/saved_variance fetched
+        self.use_global_stats = True
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y',
+            'mean',
+            'variance',
+            'x@GRAD',
+            'scale@GRAD',
+            'bias@GRAD',
+        ]
+
+    def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format):  # NumPy reference gradients; returns (x_grad, grad_scale, grad_offset)
+        if data_format == "NCHW":  # move axis 1 to the end so the sums below reduce axes (0, 1, 2)
+            x = np.transpose(x, (0, 2, 3, 1))
+            y_grad = np.transpose(y_grad, (0, 2, 3, 1))
+
+        x_grad = scale * y_grad / np.sqrt(var + epsilon)  # mean and var are used as given, not recomputed from x
+        grad_scale = np.sum(
+            y_grad * (x - mean) / np.sqrt(var + epsilon), axis=(0, 1, 2)
+        )
+        grad_offset = np.sum(y_grad, axis=(0, 1, 2))
+
+        # transfer back to N, C, H, W
+        if data_format == "NCHW":
+            x_grad = np.transpose(x_grad, (0, 3, 1, 2))
+            x = np.transpose(x, (0, 3, 1, 2))  # rebinds a local that is not read again in this method
+            y_grad = np.transpose(y_grad, (0, 3, 1, 2))  # rebinds a local that is not read again in this method
+
+        return x_grad, grad_scale, grad_offset
+
+    def ref_forward_backward(  # NumPy reference for forward outputs and gradients using the given mean/variance
+        self,
+        x,
+        y_grad,
+        scale,
+        bias,
+        mean,
+        variance,
+        epsilon,
+        momentum,  # not referenced in this method's body
+        shape,  # not referenced in this method's body
+        data_layout,
+    ):
+        if data_layout != "NCHW" and data_layout != "NHWC":  # only these two layouts are accepted
+            raise ValueError("Unknown data order.")
+
+        if data_layout == "NCHW":
+            x = np.transpose(x, (0, 2, 3, 1))
+
+        # run normalization
+        normalized = (x - mean) / np.sqrt(variance + epsilon)
+        y = normalized * scale + bias
+
+        # transfer back to N, C, H, W
+        if data_layout == "NCHW":
+            x = np.transpose(x, (0, 3, 1, 2))  # restored because reference_grad transposes x itself
+            y = np.transpose(y, (0, 3, 1, 2))
+
+        mean_out = mean  # returned unchanged
+        variance_out = variance  # returned unchanged
+        saved_variance = 1.0 / np.sqrt(variance + epsilon)  # reciprocal of sqrt(variance + epsilon), not a variance
+        # run backward
+        x_grad, scale_grad, bias_grad = self.reference_grad(
+            x, y_grad, scale, mean, variance, epsilon, data_layout
+        )
+
+        return (
+            y,
+            mean_out,
+            variance_out,
+            mean,  # the input mean, returned a second time in the fourth slot
+            saved_variance,
+            x_grad,
+            scale_grad,
+            bias_grad,
+        )
+
+
+class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
+    TestBatchNormOpFreezeStatsTraining
+):
+    def init_test_case(self):  # configuration: use_global_stats on, scale/bias grads in no_grad_set
+        self.use_global_stats = True
+        self.no_grad_set = {'scale@GRAD', 'bias@GRAD'}  # these two names are also absent from fetch_list
+        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
+
+
+if __name__ == '__main__':  # script entry point
+    paddle.enable_static()  # called once before the tests are run
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_bilinear_tensor_product_op.py b/test/deprecated/legacy_test/test_bilinear_tensor_product_op_deprecated.py
similarity index 64%
rename from test/deprecated/legacy_test/test_bilinear_tensor_product_op.py
rename to test/deprecated/legacy_test/test_bilinear_tensor_product_op_deprecated.py
index 20dcc132e80d6..dee0412af3bc5 100644
--- a/test/deprecated/legacy_test/test_bilinear_tensor_product_op.py
+++ b/test/deprecated/legacy_test/test_bilinear_tensor_product_op_deprecated.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest, paddle_static_guard
+from op_test import paddle_static_guard
 
 import paddle
 from paddle import base
@@ -52,37 +52,5 @@ def test_errors(self):
                 )
 
 
-class TestBilinearTensorProductOp(OpTest):
-    def setUp(self):
-        self.op_type = "bilinear_tensor_product"
-        self.python_api = paddle.nn.functional.bilinear
-        batch_size = 6
-        size0 = 5
-        size1 = 4
-        size2 = 5
-        dtype = "float32" if base.core.is_compiled_with_rocm() else "float64"
-        a = np.random.random((batch_size, size0)).astype(dtype)
-        b = np.random.random((batch_size, size1)).astype(dtype)
-        w = np.random.random((size2, size0, size1)).astype(dtype)
-        bias = np.random.random((1, size2)).astype(dtype)
-        output = np.zeros((batch_size, size2)).astype(dtype)
-        for i in range(size2):
-            w_i = w[i, :, :]
-            output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)
-        self.inputs = {
-            'X': a,
-            'Y': b,
-            'Weight': w,
-            'Bias': bias,
-        }
-        self.outputs = {'Out': output + bias}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_communicator_geo.py b/test/deprecated/legacy_test/test_communicator_geo_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_communicator_geo.py
rename to test/deprecated/legacy_test/test_communicator_geo_deprecated.py
diff --git a/test/deprecated/legacy_test/test_compiled_program.py b/test/deprecated/legacy_test/test_compiled_program.py
index 1b6f3698afa4d..66b9039212ff1 100644
--- a/test/deprecated/legacy_test/test_compiled_program.py
+++ b/test/deprecated/legacy_test/test_compiled_program.py
@@ -76,47 +76,5 @@ def test_compiled_program_base(self):
             np.testing.assert_array_equal(float(loss_data), self.loss)
 
 
-class TestCompiledProgramError(unittest.TestCase):
-    def test_program_or_graph_error(self):
-        self.assertRaises(TypeError, base.CompiledProgram, "program")
-
-    def build_simple_model(self):
-        img = paddle.static.data(
-            name='image', shape=[-1, 1, 28, 28], dtype='float32'
-        )
-        label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
-        prediction = paddle.static.nn.fc(x=img, size=10, activation='softmax')
-        loss = paddle.nn.functional.cross_entropy(
-            input=prediction, label=label, reduction='none', use_softmax=False
-        )
-        avg_loss = paddle.mean(loss)
-
-    def compile_program(self):
-        with base.program_guard(base.Program()):
-            # build model
-            self.build_simple_model()
-            # compile program
-            program = base.default_main_program()
-            compiled_program = base.CompiledProgram(program)
-            scope = base.global_scope()
-            place = base.CPUPlace()
-            compiled_program._compile(scope, place)
-            return compiled_program, scope, place
-
-    def test_compile_scope_error(self):
-        compiled_program, _, place = self.compile_program()
-        new_scope = core.Scope()
-        with self.assertRaises(ValueError):
-            compiled_program._compile(new_scope, place)
-
-    def test_compile_place_error(self):
-        # need create different place
-        if core.is_compiled_with_cuda():
-            compiled_program, scope, _ = self.compile_program()
-            new_place = base.CUDAPlace(0)
-            with self.assertRaises(ValueError):
-                compiled_program._compile(scope, new_place)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_compiled_program_deprecated.py b/test/deprecated/legacy_test/test_compiled_program_deprecated.py
new file mode 100644
index 0000000000000..597cf6af8c7fd
--- /dev/null
+++ b/test/deprecated/legacy_test/test_compiled_program_deprecated.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+sys.path.append("../../legacy_test")
+
+import paddle
+from paddle import base
+from paddle.base import core
+
+paddle.enable_static()
+
+
+class TestCompiledProgramError(unittest.TestCase):  # error paths of base.CompiledProgram
+    def test_program_or_graph_error(self):
+        self.assertRaises(TypeError, base.CompiledProgram, "program")  # a str argument must raise TypeError
+
+    def build_simple_model(self):  # image -> fc(softmax) -> cross_entropy -> mean; returns nothing
+        img = paddle.static.data(
+            name='image', shape=[-1, 1, 28, 28], dtype='float32'
+        )
+        label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
+        prediction = paddle.static.nn.fc(x=img, size=10, activation='softmax')
+        loss = paddle.nn.functional.cross_entropy(
+            input=prediction, label=label, reduction='none', use_softmax=False
+        )
+        avg_loss = paddle.mean(loss)  # value is neither used nor returned by this method
+
+    def compile_program(self):  # returns (compiled_program, scope, place)
+        with base.program_guard(base.Program()):
+            # build the model inside the program_guard
+            self.build_simple_model()
+            # compile the default main program with the global scope on CPUPlace
+            program = base.default_main_program()
+            compiled_program = base.CompiledProgram(program)
+            scope = base.global_scope()
+            place = base.CPUPlace()
+            compiled_program._compile(scope, place)
+            return compiled_program, scope, place
+
+    def test_compile_scope_error(self):
+        compiled_program, _, place = self.compile_program()
+        new_scope = core.Scope()  # a scope other than the one used in compile_program
+        with self.assertRaises(ValueError):  # a second _compile with the new scope must raise
+            compiled_program._compile(new_scope, place)
+
+    def test_compile_place_error(self):
+        # a place different from the CPUPlace used in compile_program is required
+        if core.is_compiled_with_cuda():  # otherwise nothing runs and the test passes vacuously
+            compiled_program, scope, _ = self.compile_program()
+            new_place = base.CUDAPlace(0)
+            with self.assertRaises(ValueError):
+                compiled_program._compile(scope, new_place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_conditional_block.py b/test/deprecated/legacy_test/test_conditional_block_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_conditional_block.py
rename to test/deprecated/legacy_test/test_conditional_block_deprecated.py
diff --git a/test/deprecated/legacy_test/test_conv3d_transpose_part2_op.py b/test/deprecated/legacy_test/test_conv3d_transpose_part2_op_deprecated.py
similarity index 65%
rename from test/deprecated/legacy_test/test_conv3d_transpose_part2_op.py
rename to test/deprecated/legacy_test/test_conv3d_transpose_part2_op_deprecated.py
index 571c961ff4190..02e37f48cda2e 100644
--- a/test/deprecated/legacy_test/test_conv3d_transpose_part2_op.py
+++ b/test/deprecated/legacy_test/test_conv3d_transpose_part2_op_deprecated.py
@@ -18,93 +18,12 @@
 import numpy as np
 
 sys.path.append("../../legacy_test")
-from test_conv3d_transpose_op import (
-    TestConv3DTransposeOp,
-    create_test_cudnn_bf16_class,
-    create_test_cudnn_fp16_class,
-)
 
 import paddle
 from paddle import base
 from paddle.base import core
 
-
-class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 5, 5, 5, 3]  # NDHWC
-        f_c = self.input_size[-1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-        self.data_format = 'NHWC'
-
-
-class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 0, 1, 0, 1, 2]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 5, 5, 5, 3]  # NDHWC
-        f_c = self.input_size[-1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-        self.data_format = 'NHWC'
-
-
-class TestWithGroups_NHWC(TestConv3DTransposeOp):
-    def init_test_case(self):
-        self.check_no_filter = True
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [1, 1, 1]
-        self.groups = 2
-        self.input_size = [2, 5, 5, 5, 4]  # NDHWC
-        f_c = self.input_size[-1]
-        self.filter_size = [f_c, 3, 3, 3, 3]
-        self.data_format = 'NHWC'
-
-
-class TestWithStride_NHWC(TestConv3DTransposeOp):
-    def init_test_case(self):
-        self.pad = [1, 1, 1]
-        self.stride = [2, 2, 2]
-        self.dilations = [1, 1, 1]
-        self.groups = 1
-        self.input_size = [2, 5, 5, 5, 3]  # NCDHW
-        f_c = self.input_size[-1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-        self.data_format = 'NHWC'
-
-
-class TestWithDilation_NHWC(TestConv3DTransposeOp):
-    def init_test_case(self):
-        self.check_no_input = True
-        self.pad = [1, 1, 1]
-        self.stride = [1, 1, 1]
-        self.dilations = [2, 2, 2]
-        self.groups = 1
-        self.input_size = [2, 5, 5, 5, 3]  # NCDHW
-        f_c = self.input_size[-1]
-        self.filter_size = [f_c, 6, 3, 3, 3]
-        self.data_format = 'NHWC'
-
-
-# ----------------Conv3DTransposeCUDNN fp16----------------
-create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC)
-create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC)
-create_test_cudnn_fp16_class(TestWithGroups_NHWC)
-create_test_cudnn_fp16_class(TestWithStride_NHWC)
-create_test_cudnn_fp16_class(TestWithDilation_NHWC)
-
-
-# ----------------Conv3DTransposeCUDNN bf16----------------
-create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC)
-create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC)
-create_test_cudnn_bf16_class(TestWithGroups_NHWC)
-create_test_cudnn_bf16_class(TestWithStride_NHWC)
-create_test_cudnn_bf16_class(TestWithDilation_NHWC)
+paddle.enable_static()
 
 
 class TestConv3DTransposeAPI(unittest.TestCase):
diff --git a/test/deprecated/legacy_test/test_cost_model.py b/test/deprecated/legacy_test/test_cost_model.py
index 997a5c0c6c47b..77220b5a0cfba 100644
--- a/test/deprecated/legacy_test/test_cost_model.py
+++ b/test/deprecated/legacy_test/test_cost_model.py
@@ -33,26 +33,6 @@ def test_profiler_measure_empty_program(self):
         )
         self.assertEqual(cost_data.get_whole_time_ms(), 0)
 
-    def test_profiler_measure_program(self):
-        main_program = paddle.static.Program()
-        startup_program = paddle.static.Program()
-        with paddle.static.program_guard(main_program, startup_program):
-            # TODO(zhhsplendid): support paddle.static.data, which is uninitialized data
-            data = paddle.ones(name='X', shape=[16, 100], dtype='float32')
-            hidden = paddle.static.nn.fc(data, 10)
-            loss = paddle.mean(hidden)
-        cost_model = core.CostModel()
-        cost_data = cost_model.profile_measure(
-            main_program, startup_program, device, ["time"]
-        )
-        fc_op_time = cost_data.get_op_time_ms(0)
-        mean_op_time = cost_data.get_op_time_ms(1)
-        self.assertGreater(fc_op_time, 0)
-        self.assertGreater(mean_op_time, 0)
-        self.assertGreaterEqual(
-            cost_data.get_whole_time_ms(), fc_op_time + mean_op_time
-        )
-
     def test_static_op_benchmark_cost_model(self):
         op_name = "abs"
         cost_model = CostModel()
diff --git a/test/deprecated/legacy_test/test_cost_model_deprecated.py b/test/deprecated/legacy_test/test_cost_model_deprecated.py
new file mode 100644
index 0000000000000..b86b286ad47db
--- /dev/null
+++ b/test/deprecated/legacy_test/test_cost_model_deprecated.py
@@ -0,0 +1,48 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle.base import core
+
+paddle.enable_static()
+
+device = "gpu" if core.is_compiled_with_cuda() else "cpu"
+
+
+class TestCostModel(unittest.TestCase):
+    def test_profiler_measure_program(self):  # profiles a small ones -> fc -> mean program and checks the timings
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, startup_program):
+            # TODO(zhhsplendid): support paddle.static.data, which is uninitialized data
+            data = paddle.ones(name='X', shape=[16, 100], dtype='float32')
+            hidden = paddle.static.nn.fc(data, 10)
+            loss = paddle.mean(hidden)
+        cost_model = core.CostModel()
+        cost_data = cost_model.profile_measure(
+            main_program, startup_program, device, ["time"]  # device is the module-level "gpu"/"cpu" string
+        )
+        fc_op_time = cost_data.get_op_time_ms(0)  # assumes index 0 is the fc op — TODO confirm
+        mean_op_time = cost_data.get_op_time_ms(1)  # assumes index 1 is the mean op — TODO confirm
+        self.assertGreater(fc_op_time, 0)
+        self.assertGreater(mean_op_time, 0)
+        self.assertGreaterEqual(  # the whole-program time must cover at least the two op times
+            cost_data.get_whole_time_ms(), fc_op_time + mean_op_time
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_data_norm_op_deprecated.py b/test/deprecated/legacy_test/test_data_norm_op_deprecated.py
new file mode 100644
index 0000000000000..4019ab0c0bf40
--- /dev/null
+++ b/test/deprecated/legacy_test/test_data_norm_op_deprecated.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is unit test of Test data_norm Op."""
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import Program, program_guard
+
+paddle.enable_static()
+
+
+class TestDataNormOpErrorr(unittest.TestCase):  # NOTE(review): "Errorr" looks like a typo for "Error"
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x2 = paddle.static.data(name='x2', shape=[-1, 3, 4], dtype="int32")
+            # NOTE(review): the call below has no assertion on its result or on an exception
+            paddle.static.nn.data_norm(
+                input=x2, param_attr={}, enable_scale_and_shift=True
+            )
+
+            # Test a one-dimensional input of shape [0]; data_norm must raise ValueError
+            paddle.enable_static()
+            x3 = paddle.static.data("", shape=[0], dtype="float32")
+            self.assertRaises(ValueError, paddle.static.nn.data_norm, x3)
+
+            # The size of input in data_norm should not be 0.
+            def test_0_size():
+                paddle.enable_static()
+                x = paddle.static.data(name='x', shape=[0, 3], dtype='float32')
+                out = paddle.static.nn.data_norm(x, slot_dim=1)
+                cpu = base.core.CPUPlace()
+                exe = base.Executor(cpu)
+                exe.run(base.default_startup_program())
+                test_program = base.default_main_program().clone(for_test=True)
+                exe.run(
+                    test_program,
+                    fetch_list=out,
+                    feed={'x': np.ones([0, 3]).astype('float32')},  # zero-row feed matching the [0, 3] data shape
+                )
+
+            self.assertRaises(ValueError, test_0_size)  # ValueError may come from any statement inside test_0_size
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_dataloader_early_reset.py b/test/deprecated/legacy_test/test_dataloader_early_reset_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_dataloader_early_reset.py
rename to test/deprecated/legacy_test/test_dataloader_early_reset_deprecated.py
index 1c826eb9cb89a..65c1488f66ae0 100644
--- a/test/deprecated/legacy_test/test_dataloader_early_reset.py
+++ b/test/deprecated/legacy_test/test_dataloader_early_reset_deprecated.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 def infinite_reader():
     num = 0
diff --git a/test/deprecated/legacy_test/test_dataloader_keep_order.py b/test/deprecated/legacy_test/test_dataloader_keep_order_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_dataloader_keep_order.py
rename to test/deprecated/legacy_test/test_dataloader_keep_order_deprecated.py
index a37e1b4939770..8c05c6a97344c 100644
--- a/test/deprecated/legacy_test/test_dataloader_keep_order.py
+++ b/test/deprecated/legacy_test/test_dataloader_keep_order_deprecated.py
@@ -20,6 +20,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 def create_reader(shape, batch_number):
     def __impl__():
diff --git a/test/deprecated/legacy_test/test_dataloader_unkeep_order.py b/test/deprecated/legacy_test/test_dataloader_unkeep_order_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_dataloader_unkeep_order.py
rename to test/deprecated/legacy_test/test_dataloader_unkeep_order_deprecated.py
index 17e5257dffc01..acc272b766f89 100644
--- a/test/deprecated/legacy_test/test_dataloader_unkeep_order.py
+++ b/test/deprecated/legacy_test/test_dataloader_unkeep_order_deprecated.py
@@ -21,6 +21,8 @@
 from paddle import base
 from paddle.base.reader import keep_data_loader_order
 
+paddle.enable_static()
+
 keep_data_loader_order(False)
 
 
diff --git a/test/deprecated/legacy_test/test_dataset.py b/test/deprecated/legacy_test/test_dataset.py
index fd4141c1c8b64..6a005c1f28576 100644
--- a/test/deprecated/legacy_test/test_dataset.py
+++ b/test/deprecated/legacy_test/test_dataset.py
@@ -923,148 +923,6 @@ def setUp(self):
         self.drop_last = False
 
 
-class TestDatasetWithFetchHandler(unittest.TestCase):
-    """
-    Test Dataset With Fetch Handler. TestCases.
-    """
-
-    def net(self):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-        """
-        slots = ["slot1", "slot2", "slot3", "slot4"]
-        slots_vars = []
-        poolings = []
-        for slot in slots:
-            data = paddle.static.data(
-                name=slot, shape=[-1, 1], dtype="int64", lod_level=1
-            )
-            var = paddle.cast(x=data, dtype='float32')
-            pool = paddle.static.nn.sequence_lod.sequence_pool(
-                input=var, pool_type='AVERAGE'
-            )
-
-            slots_vars.append(data)
-            poolings.append(pool)
-
-        concated = paddle.concat(poolings, axis=1)
-        fc = paddle.static.nn.fc(x=concated, activation='tanh', size=32)
-        return slots_vars, fc
-
-    def get_dataset(self, inputs, files):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-
-        Args:
-            inputs(list): inputs of get_dataset
-            files(list): files of  get_dataset
-        """
-        dataset = paddle.distributed.QueueDataset()
-        dataset.init(
-            batch_size=32, thread_num=2, pipe_command="cat", use_var=inputs
-        )
-        dataset.set_filelist(files)
-        return dataset
-
-    def setUp(self):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-        """
-        self.temp_dir = tempfile.TemporaryDirectory()
-        self.filename1 = os.path.join(
-            self.temp_dir.name, "test_queue_dataset_run_a.txt"
-        )
-        self.filename2 = os.path.join(
-            self.temp_dir.name, "test_queue_dataset_run_b.txt"
-        )
-
-        with open(self.filename1, "w") as f:
-            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
-            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
-            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
-            f.write(data)
-        with open(self.filename2, "w") as f:
-            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
-            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
-            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
-            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
-            f.write(data)
-
-    def tearDown(self):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-        """
-        self.temp_dir.cleanup()
-
-    def test_dataset_none(self):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-        """
-        slots_vars, out = self.net()
-        files = [self.filename1, self.filename2]
-        dataset = self.get_dataset(slots_vars, files)
-
-        exe = base.Executor(base.CPUPlace())
-        exe.run(base.default_startup_program())
-
-        # test dataset->None
-        try:
-            exe.train_from_dataset(base.default_main_program(), None)
-        except ImportError as e:
-            print("warning: we skip trainer_desc_pb2 import problem in windows")
-        except RuntimeError as e:
-            error_msg = "dataset is need and should be initialized"
-            self.assertEqual(error_msg, str(e))
-        except Exception as e:
-            self.assertTrue(False)
-
-    def test_infer_from_dataset(self):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-        """
-        slots_vars, out = self.net()
-        files = [self.filename1, self.filename2]
-        dataset = self.get_dataset(slots_vars, files)
-
-        exe = base.Executor(base.CPUPlace())
-        exe.run(base.default_startup_program())
-
-        try:
-            exe.infer_from_dataset(base.default_main_program(), dataset)
-        except ImportError as e:
-            print("warning: we skip trainer_desc_pb2 import problem in windows")
-        except Exception as e:
-            self.assertTrue(False)
-
-    def test_fetch_handler(self):
-        """
-        Test Dataset With Fetch Handler. TestCases.
-        """
-        slots_vars, out = self.net()
-        files = [self.filename1, self.filename2]
-        dataset = self.get_dataset(slots_vars, files)
-
-        exe = base.Executor(base.CPUPlace())
-        exe.run(base.default_startup_program())
-
-        fh = base.executor.FetchHandler(out.name)
-        fh.help()
-
-        try:
-            exe.train_from_dataset(
-                program=base.default_main_program(),
-                dataset=dataset,
-                fetch_handler=fh,
-            )
-        except ImportError as e:
-            print("warning: we skip trainer_desc_pb2 import problem in windows")
-        except RuntimeError as e:
-            error_msg = "dataset is need and should be initialized"
-            self.assertEqual(error_msg, str(e))
-        except Exception as e:
-            self.assertTrue(False)
-
-
 class TestDataset2(unittest.TestCase):
     """TestCases for Dataset."""
 
diff --git a/test/deprecated/legacy_test/test_dataset_deprecated.py b/test/deprecated/legacy_test/test_dataset_deprecated.py
new file mode 100644
index 0000000000000..f3af35297e284
--- /dev/null
+++ b/test/deprecated/legacy_test/test_dataset_deprecated.py
@@ -0,0 +1,172 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+TestCases for Dataset,
+including create, config, run, etc.
+"""
+
+import os
+import tempfile
+import unittest
+
+import paddle
+from paddle import base
+
+paddle.enable_static()
+
+
+class TestDatasetWithFetchHandler(unittest.TestCase):
+    """
+    Test Dataset With Fetch Handler. TestCases.
+    """
+
+    def net(self):
+        """
+        Build the test network; returns the slot inputs and the fc output.
+        """
+        slots = ["slot1", "slot2", "slot3", "slot4"]
+        slots_vars = []
+        poolings = []
+        for slot in slots:  # one int64 input per slot, cast then pooled
+            data = paddle.static.data(
+                name=slot, shape=[-1, 1], dtype="int64", lod_level=1
+            )
+            var = paddle.cast(x=data, dtype='float32')
+            pool = paddle.static.nn.sequence_lod.sequence_pool(
+                input=var, pool_type='AVERAGE'
+            )
+
+            slots_vars.append(data)
+            poolings.append(pool)
+
+        concated = paddle.concat(poolings, axis=1)
+        fc = paddle.static.nn.fc(x=concated, activation='tanh', size=32)
+        return slots_vars, fc
+
+    def get_dataset(self, inputs, files):
+        """
+        Build a QueueDataset over the given files and return it.
+
+        Args:
+            inputs(list): variables handed to dataset.init as use_var
+            files(list): file paths handed to dataset.set_filelist
+        """
+        dataset = paddle.distributed.QueueDataset()
+        dataset.init(
+            batch_size=32, thread_num=2, pipe_command="cat", use_var=inputs
+        )
+        dataset.set_filelist(files)
+        return dataset
+
+    def setUp(self):
+        """
+        Create a temp directory and write the two text files the tests read.
+        """
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.filename1 = os.path.join(
+            self.temp_dir.name, "test_queue_dataset_run_a.txt"
+        )
+        self.filename2 = os.path.join(
+            self.temp_dir.name, "test_queue_dataset_run_b.txt"
+        )
+
+        with open(self.filename1, "w") as f:
+            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
+            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
+            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
+            f.write(data)
+        with open(self.filename2, "w") as f:
+            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
+            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
+            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
+            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
+            f.write(data)
+
+    def tearDown(self):
+        """
+        Clean up the temporary directory created in setUp.
+        """
+        self.temp_dir.cleanup()
+
+    def test_dataset_none(self):
+        """
+        Call train_from_dataset with dataset=None and check the RuntimeError.
+        """
+        slots_vars, out = self.net()
+        files = [self.filename1, self.filename2]
+        dataset = self.get_dataset(slots_vars, files)
+
+        exe = base.Executor(base.CPUPlace())
+        exe.run(base.default_startup_program())
+
+        # test dataset->None
+        try:
+            exe.train_from_dataset(base.default_main_program(), None)
+        except ImportError:
+            print("warning: we skip trainer_desc_pb2 import problem in windows")
+        except RuntimeError as e:
+            error_msg = "dataset is need and should be initialized"
+            self.assertEqual(error_msg, str(e))
+        except Exception as e:
+            self.fail(f"unexpected exception: {e!r}")
+
+    def test_infer_from_dataset(self):
+        """
+        Run infer_from_dataset on the dataset; only ImportError is tolerated.
+        """
+        slots_vars, out = self.net()
+        files = [self.filename1, self.filename2]
+        dataset = self.get_dataset(slots_vars, files)
+
+        exe = base.Executor(base.CPUPlace())
+        exe.run(base.default_startup_program())
+
+        try:
+            exe.infer_from_dataset(base.default_main_program(), dataset)
+        except ImportError:
+            print("warning: we skip trainer_desc_pb2 import problem in windows")
+        except Exception as e:
+            self.fail(f"unexpected exception: {e!r}")
+
+    def test_fetch_handler(self):
+        """
+        Run train_from_dataset with a FetchHandler built from the fc output.
+        """
+        slots_vars, out = self.net()
+        files = [self.filename1, self.filename2]
+        dataset = self.get_dataset(slots_vars, files)
+
+        exe = base.Executor(base.CPUPlace())
+        exe.run(base.default_startup_program())
+
+        fh = base.executor.FetchHandler(out.name)
+        fh.help()
+
+        try:
+            exe.train_from_dataset(
+                program=base.default_main_program(),
+                dataset=dataset,
+                fetch_handler=fh,
+            )
+        except ImportError:
+            print("warning: we skip trainer_desc_pb2 import problem in windows")
+        except RuntimeError as e:
+            error_msg = "dataset is need and should be initialized"
+            self.assertEqual(error_msg, str(e))
+        except Exception as e:
+            self.fail(f"unexpected exception: {e!r}")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_decoupled_py_reader.py b/test/deprecated/legacy_test/test_decoupled_py_reader_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_decoupled_py_reader.py
rename to test/deprecated/legacy_test/test_decoupled_py_reader_deprecated.py
index a28b2584a5ff6..f9c75dc7dfd59 100644
--- a/test/deprecated/legacy_test/test_decoupled_py_reader.py
+++ b/test/deprecated/legacy_test/test_decoupled_py_reader_deprecated.py
@@ -20,6 +20,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 EPOCH_NUM = 5
 BATCH_SIZE = 16
 BATCH_NUM = 10
diff --git a/test/deprecated/legacy_test/test_deform_conv2d.py b/test/deprecated/legacy_test/test_deform_conv2d_deprecated.py
similarity index 57%
rename from test/deprecated/legacy_test/test_deform_conv2d.py
rename to test/deprecated/legacy_test/test_deform_conv2d_deprecated.py
index 23208363b5ff9..d8ad41359adfd 100644
--- a/test/deprecated/legacy_test/test_deform_conv2d.py
+++ b/test/deprecated/legacy_test/test_deform_conv2d_deprecated.py
@@ -19,213 +19,6 @@
 
 import paddle
 import paddle.nn.initializer as I
-from paddle.pir_utils import test_with_pir_api
-
-
-class TestDeformConv2D(TestCase):
-    batch_size = 4
-    spatial_shape = (5, 5)
-    dtype = "float32"
-
-    def setUp(self):
-        self.in_channels = 2
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [0, 0]
-        self.stride = [1, 1]
-        self.dilation = [1, 1]
-        self.deformable_groups = 1
-        self.groups = 1
-        self.no_bias = True
-
-    def prepare(self):
-        np.random.seed(1)
-        paddle.seed(1)
-        if isinstance(self.kernel_size, int):
-            filter_shape = (self.kernel_size,) * 2
-        else:
-            filter_shape = tuple(self.kernel_size)
-        self.filter_shape = filter_shape
-
-        self.weight = np.random.uniform(
-            -1,
-            1,
-            (self.out_channels, self.in_channels // self.groups) + filter_shape,
-        ).astype(self.dtype)
-        if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype(
-                self.dtype
-            )
-
-        def out_size(
-            in_size, pad_size, dilation_size, kernel_size, stride_size
-        ):
-            return (
-                in_size + 2 * pad_size - (dilation_size * (kernel_size - 1) + 1)
-            ) / stride_size + 1
-
-        out_h = int(
-            out_size(
-                self.spatial_shape[0],
-                self.padding[0],
-                self.dilation[0],
-                self.kernel_size[0],
-                self.stride[0],
-            )
-        )
-        out_w = int(
-            out_size(
-                self.spatial_shape[1],
-                self.padding[1],
-                self.dilation[1],
-                self.kernel_size[1],
-                self.stride[1],
-            )
-        )
-        out_shape = (out_h, out_w)
-
-        self.input_shape = (
-            self.batch_size,
-            self.in_channels,
-        ) + self.spatial_shape
-
-        self.offset_shape = (
-            self.batch_size,
-            self.deformable_groups * 2 * filter_shape[0] * filter_shape[1],
-        ) + out_shape
-
-        self.mask_shape = (
-            self.batch_size,
-            self.deformable_groups * filter_shape[0] * filter_shape[1],
-        ) + out_shape
-
-        self.input = np.random.uniform(-1, 1, self.input_shape).astype(
-            self.dtype
-        )
-
-        self.offset = np.random.uniform(-1, 1, self.offset_shape).astype(
-            self.dtype
-        )
-
-        self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype)
-
-    def static_graph_case_dcn(self):
-        main = paddle.static.Program()
-        start = paddle.static.Program()
-        paddle.enable_static()
-        with paddle.static.program_guard(main, start):
-            x = paddle.static.data(
-                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype
-            )
-            offset = paddle.static.data(
-                "offset",
-                (
-                    -1,
-                    self.deformable_groups
-                    * 2
-                    * self.filter_shape[0]
-                    * self.filter_shape[1],
-                    -1,
-                    -1,
-                ),
-                dtype=self.dtype,
-            )
-            mask = paddle.static.data(
-                "mask",
-                (
-                    -1,
-                    self.deformable_groups
-                    * self.filter_shape[0]
-                    * self.filter_shape[1],
-                    -1,
-                    -1,
-                ),
-                dtype=self.dtype,
-            )
-
-            y_v1 = paddle.vision.ops.DeformConv2D(
-                in_channels=self.in_channels,
-                out_channels=self.out_channels,
-                kernel_size=self.filter_shape,
-                stride=self.stride,
-                padding=self.padding,
-                dilation=self.dilation,
-                groups=self.groups,
-                deformable_groups=self.deformable_groups,
-                weight_attr=I.Assign(self.weight),
-                bias_attr=False if self.no_bias else I.Assign(self.bias),
-            )(x, offset, None)
-
-            y_v2 = paddle.vision.ops.DeformConv2D(
-                in_channels=self.in_channels,
-                out_channels=self.out_channels,
-                kernel_size=self.filter_shape,
-                stride=self.stride,
-                padding=self.padding,
-                dilation=self.dilation,
-                groups=self.groups,
-                deformable_groups=self.deformable_groups,
-                weight_attr=I.Assign(self.weight),
-                bias_attr=False if self.no_bias else I.Assign(self.bias),
-            )(x, offset, mask)
-
-        exe = paddle.static.Executor(self.place)
-        exe.run(start)
-        out_v1, out_v2 = exe.run(
-            main,
-            feed={
-                "input": self.input,
-                "offset": self.offset,
-                "mask": self.mask,
-            },
-            fetch_list=[y_v1, y_v2],
-        )
-        return out_v1, out_v2
-
-    def dygraph_case_dcn(self):
-        paddle.disable_static()
-        x = paddle.to_tensor(self.input)
-        offset = paddle.to_tensor(self.offset)
-        mask = paddle.to_tensor(self.mask)
-
-        bias = None if self.no_bias else paddle.to_tensor(self.bias)
-
-        deform_conv2d = paddle.vision.ops.DeformConv2D(
-            in_channels=self.in_channels,
-            out_channels=self.out_channels,
-            kernel_size=self.kernel_size,
-            stride=self.stride,
-            padding=self.padding,
-            dilation=self.dilation,
-            deformable_groups=self.deformable_groups,
-            groups=self.groups,
-            weight_attr=I.Assign(self.weight),
-            bias_attr=False if self.no_bias else I.Assign(self.bias),
-        )
-
-        y_v1 = deform_conv2d(x, offset)
-        y_v2 = deform_conv2d(x, offset, mask)
-
-        out_v1 = y_v1.numpy()
-        out_v2 = y_v2.numpy()
-
-        return out_v1, out_v2
-
-    @test_with_pir_api
-    def _test_identity(self):
-        self.prepare()
-        static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn()
-        dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn()
-        np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1)
-        np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2)
-
-    def test_identity(self):
-        self.place = paddle.CPUPlace()
-        self._test_identity()
-
-        if paddle.is_compiled_with_cuda():
-            self.place = paddle.CUDAPlace(0)
-            self._test_identity()
 
 
 class TestDeformConv2DFunctional(TestCase):
@@ -536,98 +329,6 @@ def test_identity(self):
             self._test_identity()
 
 
-# testcases for DeformConv2D
-class TestDeformConv2DWithPadding(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [2, 2]
-        self.stride = [1, 1]
-        self.dilation = [1, 1]
-        self.deformable_groups = 1
-        self.groups = 1
-        self.no_bias = True
-
-
-class TestDeformConv2DWithBias(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [2, 2]
-        self.stride = [1, 1]
-        self.dilation = [1, 1]
-        self.deformable_groups = 1
-        self.groups = 1
-        self.no_bias = False
-
-
-class TestDeformConv2DWithAsynPadding(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [1, 2]
-        self.stride = [1, 1]
-        self.dilation = [1, 1]
-        self.deformable_groups = 1
-        self.groups = 1
-        self.no_bias = False
-
-
-class TestDeformConv2DWithDilation(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [1, 1]
-        self.stride = [1, 1]
-        self.dilation = [3, 3]
-        self.deformable_groups = 1
-        self.groups = 1
-        self.no_bias = False
-
-
-class TestDeformConv2DWithStride(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [1, 1]
-        self.stride = [2, 2]
-        self.dilation = [1, 1]
-        self.deformable_groups = 1
-        self.groups = 1
-        self.no_bias = False
-
-
-class TestDeformConv2DWithDeformable_Groups(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 5
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [1, 1]
-        self.stride = [1, 1]
-        self.dilation = [1, 1]
-        self.deformable_groups = 5
-        self.groups = 1
-        self.no_bias = False
-
-
-class TestDeformConv2DWithGroups(TestDeformConv2D):
-    def setUp(self):
-        self.in_channels = 5
-        self.out_channels = 5
-        self.kernel_size = [3, 3]
-        self.padding = [1, 1]
-        self.stride = [1, 1]
-        self.dilation = [1, 1]
-        self.deformable_groups = 1
-        self.groups = 5
-        self.no_bias = False
-
-
 # testcases for deform_conv2d
 class TestDeformConv2DFunctionalWithPadding(TestDeformConv2DFunctional):
     def setUp(self):
@@ -722,27 +423,5 @@ def setUp(self):
         self.no_bias = False
 
 
-class TestDeformConv2DError(unittest.TestCase):
-    @test_with_pir_api
-    def test_input_error(self):
-        def test_input_rank_error():
-            paddle.enable_static()
-            x = paddle.static.data(name='error_x_1', shape=[0], dtype='float32')
-            offset = paddle.static.data(
-                name='error_offset_1', shape=[0], dtype='float32'
-            )
-            mask = paddle.static.data(
-                name='error_mask_1', shape=[0, 0, 0], dtype='float32'
-            )
-            out = paddle.vision.ops.DeformConv2D(
-                in_channels=0,
-                out_channels=0,
-                kernel_size=0,
-                deformable_groups=0,
-            )(x, offset, mask)
-
-        self.assertRaises(AssertionError, test_input_rank_error)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_deformable_conv_op_deprecated.py b/test/deprecated/legacy_test/test_deformable_conv_op_deprecated.py
new file mode 100644
index 0000000000000..04bbc51d48fda
--- /dev/null
+++ b/test/deprecated/legacy_test/test_deformable_conv_op_deprecated.py
@@ -0,0 +1,178 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from itertools import product
+
+import numpy as np
+
+import paddle
+
+paddle.enable_static()
+
+
+def dmc_bilinear(data_im, height, width, h, w):  # bilinear sample at (h, w)
+    h_low = int(np.floor(h))
+    w_low = int(np.floor(w))
+    h_high = h_low + 1
+    w_high = w_low + 1
+    # fractional distances from the low corner, and their complements
+    lh = h - h_low
+    lw = w - w_low
+    hh = 1 - lh
+    hw = 1 - lw
+    # each corner value stays 0 unless that corner passes its bounds check
+    v1 = 0
+    if h_low >= 0 and w_low >= 0:
+        v1 = data_im[h_low, w_low]
+    v2 = 0
+    if h_low >= 0 and w_high <= width - 1:
+        v2 = data_im[h_low, w_high]
+    v3 = 0
+    if h_high <= height - 1 and w_low >= 0:
+        v3 = data_im[h_high, w_low]
+    v4 = 0
+    if h_high <= height - 1 and w_high <= width - 1:
+        v4 = data_im[h_high, w_high]
+    # blend the four corners using products of the fractional weights
+    w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw
+    val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
+
+    return val
+
+
+def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param):
+    """NumPy deformable conv: masked im2col sampling, then grouped matmul."""
+    in_n, in_c, in_h, in_w = input.shape
+    out_c, f_c, f_h, f_w = filter.shape
+
+    assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w)
+    assert mask.shape == (in_n, f_h * f_w, in_h, in_w)
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+
+    stride = conv_param['stride']
+    pad = conv_param['pad']
+    dilation = conv_param['dilation']
+    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
+    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
+    # only configurations that keep the spatial size are supported
+    assert out_h == in_h
+    assert out_w == in_w
+
+    col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w))
+    for n, c, h, w, kh, kw in product(
+        range(in_n),
+        range(in_c),
+        range(out_h),
+        range(out_w),
+        range(f_h),
+        range(f_w),
+    ):
+        offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w)
+        offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w)
+        mask_table = mask[n, :, h, w].reshape(f_h, f_w)
+        offset_h = offset_h_table[kh, kw]
+        offset_w = offset_w_table[kh, kw]
+        val = 0  # stays 0 when the sample point is outside the image
+        im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0]
+        im_w = w * stride[1] + kw * dilation[1] + offset_w - pad[1]
+        if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_w:
+            val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w)
+        val_out = val * mask_table[kh, kw]
+        col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out
+
+    out = np.zeros((in_n, group, int(out_c // group), out_h * out_w))
+    weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w)
+    col_buffer = col_buffer.reshape(
+        (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w)
+    )
+    for n in range(in_n):
+        for g in range(group):
+            out[n, g] = np.matmul(weight[g], col_buffer[n, g])
+    out = out.reshape(in_n, out_c, out_h, out_w)
+    return out
+
+
+class TestModulatedDeformableConvInvalidInput(unittest.TestCase):
+    def test_error(self):  # each nested case must raise the asserted error
+        def test_invalid_input():  # input is a plain Python list
+            paddle.enable_static()
+            input = [1, 3, 32, 32]
+            offset = paddle.static.data(
+                name='offset', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            mask = paddle.static.data(
+                name='mask', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            loss = paddle.static.nn.common.deformable_conv(
+                input, offset, mask, num_filters=4, filter_size=1
+            )
+
+        self.assertRaises(TypeError, test_invalid_input)
+
+        def test_invalid_offset():  # NOTE: the int32 tensor here is input
+            paddle.enable_static()
+            input = paddle.static.data(
+                name='input', shape=[None, 3, 32, 32], dtype='int32'
+            )
+            offset = paddle.static.data(
+                name='offset', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            mask = paddle.static.data(
+                name='mask', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            loss = paddle.static.nn.common.deformable_conv(
+                input, offset, mask, num_filters=4, filter_size=1
+            )
+
+        self.assertRaises(TypeError, test_invalid_offset)
+
+        def test_invalid_filter():  # filter_size=0
+            paddle.enable_static()
+            input = paddle.static.data(
+                name='input_filter', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            offset = paddle.static.data(
+                name='offset_filter', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            mask = paddle.static.data(
+                name='mask_filter', shape=[None, 3, 32, 32], dtype='float32'
+            )
+            loss = paddle.static.nn.common.deformable_conv(
+                input, offset, mask, num_filters=4, filter_size=0
+            )
+
+        self.assertRaises(ValueError, test_invalid_filter)
+
+        def test_invalid_groups():  # groups=0
+            paddle.enable_static()
+            input = paddle.static.data(
+                name='input_groups', shape=[1, 1, 1, 1], dtype='float32'
+            )
+            offset = paddle.static.data(
+                name='offset_groups', shape=[1, 1], dtype='float32'
+            )
+            mask = paddle.static.data(
+                name='mask_groups', shape=[1], dtype='float32'
+            )
+            paddle.static.nn.deform_conv2d(
+                input, offset, mask, 1, 1, padding=1, groups=0
+            )
+
+        self.assertRaises(ValueError, test_invalid_groups)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_desc_clone.py b/test/deprecated/legacy_test/test_desc_clone_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_desc_clone.py
rename to test/deprecated/legacy_test/test_desc_clone_deprecated.py
diff --git a/test/deprecated/legacy_test/test_device_guard.py b/test/deprecated/legacy_test/test_device_guard_deprecated.py
similarity index 75%
rename from test/deprecated/legacy_test/test_device_guard.py
rename to test/deprecated/legacy_test/test_device_guard_deprecated.py
index 502cde95f4007..ddc2e1a1e0958 100644
--- a/test/deprecated/legacy_test/test_device_guard.py
+++ b/test/deprecated/legacy_test/test_device_guard_deprecated.py
@@ -93,57 +93,6 @@ def test_device_guard_with_id(self):
 
         execute(main_program, startup_program)
 
-    def test_cpu_only_op(self):
-        main_program = paddle.static.Program()
-        startup_program = paddle.static.Program()
-        with paddle.static.program_guard(main_program, startup_program):
-            x = paddle.full(
-                shape=[2, 255, 13, 13], fill_value=0.3, dtype='float32'
-            )
-            gt_box = paddle.full(
-                shape=[2, 6, 4], fill_value=0.5, dtype='float32'
-            )
-            gt_label = paddle.full(shape=[2, 6], fill_value=1.0, dtype='int32')
-            gt_score = paddle.full(
-                shape=[2, 6], fill_value=0.5, dtype='float32'
-            )
-            anchors = [
-                10,
-                13,
-                16,
-                30,
-                33,
-                23,
-                30,
-                61,
-                62,
-                45,
-                59,
-                119,
-                116,
-                90,
-                156,
-                198,
-                373,
-                326,
-            ]
-            anchor_mask = [0, 1, 2]
-            with paddle.static.device_guard("gpu"):
-                # yolo_loss only has cpu kernel, so its cpu kernel will be executed
-                loss = paddle.vision.ops.yolo_loss(
-                    x=x,
-                    gt_box=gt_box,
-                    gt_label=gt_label,
-                    gt_score=gt_score,
-                    anchors=anchors,
-                    anchor_mask=anchor_mask,
-                    class_num=80,
-                    ignore_thresh=0.7,
-                    downsample_ratio=32,
-                )
-
-        execute(main_program, startup_program)
-
     @test_with_pir_api
     def test_without_kernel_op(self):
         main_program = paddle.static.Program()
@@ -174,18 +123,6 @@ def test_without_kernel_op(self):
 
         execute(main_program, startup_program)
 
-    def test_error(self):
-        def device_attr():
-            with paddle.static.device_guard("cpu1"):
-                out = paddle.full(shape=[1], fill_value=0.2, dtype='float32')
-
-        def device_attr2():
-            with paddle.static.device_guard("cpu:1"):
-                out = paddle.full(shape=[1], fill_value=0.2, dtype='float32')
-
-        self.assertRaises(ValueError, device_attr)
-        self.assertRaises(ValueError, device_attr2)
-
     # check if op_descs have op_device attr
     def test_op_descs_device_attr(self):
         main_program = paddle.static.Program()
diff --git a/test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py b/test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py
rename to test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto.py b/test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto.py
rename to test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py b/test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py
rename to test/deprecated/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_decay.py b/test/deprecated/legacy_test/test_dist_fleet_decay_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_decay.py
rename to test/deprecated/legacy_test/test_dist_fleet_decay_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_heter_program.py b/test/deprecated/legacy_test/test_dist_fleet_heter_program_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_heter_program.py
rename to test/deprecated/legacy_test/test_dist_fleet_heter_program_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps10.py b/test/deprecated/legacy_test/test_dist_fleet_ps10_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps10.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps10_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps13.py b/test/deprecated/legacy_test/test_dist_fleet_ps13_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps13.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps13_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps2.py b/test/deprecated/legacy_test/test_dist_fleet_ps2_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps2.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps2_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps3.py b/test/deprecated/legacy_test/test_dist_fleet_ps3_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps3.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps3_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps4.py b/test/deprecated/legacy_test/test_dist_fleet_ps4_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps4.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps4_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps5.py b/test/deprecated/legacy_test/test_dist_fleet_ps5_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps5.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps5_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps6.py b/test/deprecated/legacy_test/test_dist_fleet_ps6_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps6.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps6_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps7.py b/test/deprecated/legacy_test/test_dist_fleet_ps7_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps7.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps7_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps8.py b/test/deprecated/legacy_test/test_dist_fleet_ps8_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps8.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps8_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps9.py b/test/deprecated/legacy_test/test_dist_fleet_ps9_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps9.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps9_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_fleet_ps.py b/test/deprecated/legacy_test/test_dist_fleet_ps_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_fleet_ps.py
rename to test/deprecated/legacy_test/test_dist_fleet_ps_deprecated.py
diff --git a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adagrad.py b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adagrad.py
index 35dd48accd42a..518819cb15591 100644
--- a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adagrad.py
+++ b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adagrad.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram
+from test_dist_sparse_tensor_load_sgd_deprecated import TestSparseLoadProgram
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adam.py b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adam.py
index b5eae0e39807e..88d28dc8cad52 100644
--- a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adam.py
+++ b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_adam.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram
+from test_dist_sparse_tensor_load_sgd_deprecated import TestSparseLoadProgram
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_ftrl.py b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_ftrl.py
index 6a1f0175b1619..a1cde59a3d7e0 100644
--- a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_ftrl.py
+++ b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_ftrl.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram
+from test_dist_sparse_tensor_load_sgd_deprecated import TestSparseLoadProgram
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_momentum.py b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_momentum.py
index b7b590cbb3224..35f8ad09d8f3f 100644
--- a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_momentum.py
+++ b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_momentum.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram
+from test_dist_sparse_tensor_load_sgd_deprecated import TestSparseLoadProgram
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_rmsprop.py b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_rmsprop.py
index 9ce8e211f1e67..0150ae8027a35 100644
--- a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_rmsprop.py
+++ b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_rmsprop.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram
+from test_dist_sparse_tensor_load_sgd_deprecated import TestSparseLoadProgram
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_dist_sparse_tensor_load_sgd.py b/test/deprecated/legacy_test/test_dist_sparse_tensor_load_sgd_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_dist_sparse_tensor_load_sgd.py
rename to test/deprecated/legacy_test/test_dist_sparse_tensor_load_sgd_deprecated.py
diff --git a/test/deprecated/legacy_test/test_eager_deletion_delete_vars.py b/test/deprecated/legacy_test/test_eager_deletion_delete_vars_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_eager_deletion_delete_vars.py
rename to test/deprecated/legacy_test/test_eager_deletion_delete_vars_deprecated.py
diff --git a/test/deprecated/legacy_test/test_ema.py b/test/deprecated/legacy_test/test_ema_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_ema.py
rename to test/deprecated/legacy_test/test_ema_deprecated.py
index acfd4479fe096..9dbe53ba17176 100644
--- a/test/deprecated/legacy_test/test_ema.py
+++ b/test/deprecated/legacy_test/test_ema_deprecated.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestExponentialMovingAverage(unittest.TestCase):
     def setUp(self):
diff --git a/test/deprecated/legacy_test/test_ema_fleet.py b/test/deprecated/legacy_test/test_ema_fleet_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_ema_fleet.py
rename to test/deprecated/legacy_test/test_ema_fleet_deprecated.py
index e5ff36545f818..0d19e5c7dabb0 100644
--- a/test/deprecated/legacy_test/test_ema_fleet.py
+++ b/test/deprecated/legacy_test/test_ema_fleet_deprecated.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle import static, utils
 
+paddle.enable_static()
+
 
 def gen_data():
     return np.random.random(size=(10, 5)).astype('float32')
diff --git a/test/deprecated/legacy_test/test_embedding_id_stop_gradient.py b/test/deprecated/legacy_test/test_embedding_id_stop_gradient_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_embedding_id_stop_gradient.py
rename to test/deprecated/legacy_test/test_embedding_id_stop_gradient_deprecated.py
index e39d9c4674c67..71eb6c67def0f 100644
--- a/test/deprecated/legacy_test/test_embedding_id_stop_gradient.py
+++ b/test/deprecated/legacy_test/test_embedding_id_stop_gradient_deprecated.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestEmbeddingIdStopGradientBase(unittest.TestCase):
     def setUp(self):
diff --git a/test/deprecated/legacy_test/test_entry_attr2.py b/test/deprecated/legacy_test/test_entry_attr2_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_entry_attr2.py
rename to test/deprecated/legacy_test/test_entry_attr2_deprecated.py
diff --git a/test/deprecated/legacy_test/test_entry_attr.py b/test/deprecated/legacy_test/test_entry_attr_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_entry_attr.py
rename to test/deprecated/legacy_test/test_entry_attr_deprecated.py
diff --git a/test/deprecated/legacy_test/test_error_clip.py b/test/deprecated/legacy_test/test_error_clip_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_error_clip.py
rename to test/deprecated/legacy_test/test_error_clip_deprecated.py
diff --git a/test/deprecated/legacy_test/test_executor_and_use_program_cache_deprecated.py b/test/deprecated/legacy_test/test_executor_and_use_program_cache_deprecated.py
new file mode 100644
index 0000000000000..818bb1e48c3a6
--- /dev/null
+++ b/test/deprecated/legacy_test/test_executor_and_use_program_cache_deprecated.py
@@ -0,0 +1,90 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+import numpy as np
+
+sys.path.append("../../legacy_test")
+
+import paddle
+from paddle import base
+
+paddle.enable_static()
+
+
+class TestExecutor(unittest.TestCase):
+    def test_mul(self):
+        main_program = base.Program()
+        startup_program = base.Program()
+        with base.program_guard(main_program, startup_program):
+            a = paddle.static.data(name='a', shape=[-1, 784], dtype='float32')
+            b = paddle.static.data(name='b', shape=[784, 100], dtype='float32')
+            a.desc.set_need_check_feed(False)
+            b.desc.set_need_check_feed(False)
+            output = paddle.matmul(x=a, y=b)
+
+        # Build random inputs and the NumPy reference result (a_np dot b_np)
+        a_np = np.random.random((100, 784)).astype('float32')
+        b_np = np.random.random((784, 100)).astype('float32')
+        out_np = np.dot(a_np, b_np)
+
+        place = paddle.CPUPlace()
+        exe = base.Executor(place)
+
+        def _train(use_program_cache, max_iters=1):
+            import time
+
+            run_time = 0.0
+            for i in range(max_iters):
+                begin = time.time()
+                outs = exe.run(
+                    program=main_program,
+                    feed={'a': a_np, 'b': b_np},
+                    fetch_list=[output],
+                    use_program_cache=use_program_cache,
+                )
+                end = time.time()
+                run_time += end - begin
+                out = outs[0]
+                self.assertEqual((100, 100), out.shape)
+                np.testing.assert_allclose(out, out_np, rtol=1e-05)
+            return run_time
+
+        max_iters = 3
+        run_time_with_cache = _train(
+            use_program_cache=True, max_iters=max_iters
+        )
+        print("run time with program cache: %f" % run_time_with_cache)
+
+        run_time_without_cache = _train(
+            use_program_cache=False, max_iters=max_iters
+        )
+        print("run time without program cache: %f" % run_time_without_cache)
+
+        run_time_with_cache = _train(
+            use_program_cache=True, max_iters=max_iters
+        )
+        print("run time with program cache: %f" % run_time_with_cache)
+
+        run_time_with_cache = _train(
+            use_program_cache=True, max_iters=max_iters
+        )
+        print("run time with program cache: %f" % run_time_with_cache)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_executor_check_feed.py b/test/deprecated/legacy_test/test_executor_check_feed_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_executor_check_feed.py
rename to test/deprecated/legacy_test/test_executor_check_feed_deprecated.py
index 78fb383885ec4..59be3ede8229f 100644
--- a/test/deprecated/legacy_test/test_executor_check_feed.py
+++ b/test/deprecated/legacy_test/test_executor_check_feed_deprecated.py
@@ -17,6 +17,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestExecutor(unittest.TestCase):
     def net(self):
diff --git a/test/deprecated/legacy_test/test_executor_feed_non_tensor.py b/test/deprecated/legacy_test/test_executor_feed_non_tensor.py
index b472ccdc9158c..ff3ff65e9652c 100644
--- a/test/deprecated/legacy_test/test_executor_feed_non_tensor.py
+++ b/test/deprecated/legacy_test/test_executor_feed_non_tensor.py
@@ -14,130 +14,10 @@
 
 import unittest
 
-import numpy
-
 import paddle
 from paddle import base
 
 
-class TestExecutor(unittest.TestCase):
-    def net(self):
-        lr = 0.0
-        x = paddle.static.data(name="x", shape=[None, 1], dtype='float32')
-        y = paddle.static.data(name="y", shape=[None, 1], dtype='float32')
-        y_predict = paddle.static.nn.fc(x, size=1)
-
-        cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
-        avg_cost = paddle.mean(cost)
-
-        opt = paddle.optimizer.Adam(learning_rate=lr)
-        opt.minimize(avg_cost)
-
-        return paddle.to_tensor(lr), avg_cost
-
-    def test_program_feed_float(self):
-        main_program = base.Program()
-        startup_program = base.Program()
-        scope = base.Scope()
-        with base.program_guard(main_program, startup_program):
-            with base.scope_guard(scope):
-                cpu = base.CPUPlace()
-                exe = base.Executor(cpu)
-                lr, cost = self.net()
-                exe.run(startup_program)
-                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
-                    'float32'
-                )
-                y_true = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype(
-                    'float32'
-                )
-                a = 0.01
-                _lr, _ = exe.run(
-                    feed={'x': train_data, 'y': y_true, 'lr': a},
-                    fetch_list=[lr, cost],
-                    return_numpy=False,
-                )
-            self.assertEqual(_lr._dtype(), lr.dtype)
-            self.assertEqual(_lr._dtype(), paddle.float32)
-            self.assertEqual(type(a), float)
-
-    def test_program_feed_int(self):
-        main_program = base.Program()
-        startup_program = base.Program()
-        scope = base.Scope()
-        with base.program_guard(main_program, startup_program):
-            with base.scope_guard(scope):
-                cpu = base.CPUPlace()
-                exe = base.Executor(cpu)
-                lr, cost = self.net()
-                exe.run(startup_program)
-                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
-                    'float32'
-                )
-                y_true = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype(
-                    'float32'
-                )
-                a = 0
-                _lr, _ = exe.run(
-                    feed={'x': train_data, 'y': y_true, 'lr': a},
-                    fetch_list=[lr, cost],
-                    return_numpy=False,
-                )
-            self.assertEqual(_lr._dtype(), lr.dtype)
-            self.assertEqual(_lr._dtype(), paddle.float32)
-            self.assertEqual(type(a), int)
-
-    def test_program_feed_list(self):
-        main_program = base.Program()
-        startup_program = base.Program()
-        scope = base.Scope()
-        with base.program_guard(main_program, startup_program):
-            with base.scope_guard(scope):
-                cpu = base.CPUPlace()
-                exe = base.Executor(cpu)
-                lr, cost = self.net()
-                exe.run(startup_program)
-                train_data = [[1.0], [2.0], [3.0], [4.0]]
-                y_true = [[2.0], [4.0], [6.0], [8.0]]
-                a = 0
-                _lr, _ = exe.run(
-                    feed={'x': train_data, 'y': y_true, 'lr': a},
-                    fetch_list=[lr, cost],
-                    return_numpy=False,
-                )
-            self.assertEqual(_lr._dtype(), lr.dtype)
-            self.assertEqual(_lr._dtype(), paddle.float32)
-            self.assertEqual(type(y_true), list)
-
-    def test_compiled_program_feed_scalar(self):
-        main_program = base.Program()
-        startup_program = base.Program()
-        scope = base.Scope()
-        with base.program_guard(main_program, startup_program):
-            with base.scope_guard(scope):
-                lr, cost = self.net()
-                cpu = base.CPUPlace()
-                exe = base.Executor(cpu)
-                exe.run(startup_program)
-                compiled_prog = base.CompiledProgram(main_program)
-                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
-                    'float32'
-                )
-                y_true = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype(
-                    'float32'
-                )
-                a = 0.01
-                _lr, _ = exe.run(
-                    compiled_prog,
-                    feed={'x': train_data, 'y': y_true, 'lr': a},
-                    fetch_list=[lr, cost],
-                    return_numpy=False,
-                )
-                self.assertEqual(_lr._dtype(), lr.dtype)
-                self.assertEqual(_lr._dtype(), paddle.float32)
-                self.assertEqual(type(a), float)
-
-
 class TestAsLodTensor(unittest.TestCase):
     def test_as_lodtensor_int32(self):
         cpu = base.CPUPlace()
diff --git a/test/deprecated/legacy_test/test_executor_feed_non_tensor_deprecated.py b/test/deprecated/legacy_test/test_executor_feed_non_tensor_deprecated.py
new file mode 100644
index 0000000000000..cbcdffd11fa2e
--- /dev/null
+++ b/test/deprecated/legacy_test/test_executor_feed_non_tensor_deprecated.py
@@ -0,0 +1,144 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy
+
+import paddle
+from paddle import base
+
+paddle.enable_static()
+
+
+class TestExecutor(unittest.TestCase):
+    def net(self):
+        lr = 0.0
+        x = paddle.static.data(name="x", shape=[None, 1], dtype='float32')
+        y = paddle.static.data(name="y", shape=[None, 1], dtype='float32')
+        y_predict = paddle.static.nn.fc(x, size=1)
+
+        cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
+        avg_cost = paddle.mean(cost)
+
+        opt = paddle.optimizer.Adam(learning_rate=lr)
+        opt.minimize(avg_cost)
+
+        return paddle.to_tensor(lr), avg_cost
+
+    def test_program_feed_float(self):
+        main_program = base.Program()
+        startup_program = base.Program()
+        scope = base.Scope()
+        with base.program_guard(main_program, startup_program):
+            with base.scope_guard(scope):
+                cpu = base.CPUPlace()
+                exe = base.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
+                    'float32'
+                )
+                y_true = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype(
+                    'float32'
+                )
+                a = 0.01
+                _lr, _ = exe.run(
+                    feed={'x': train_data, 'y': y_true, 'lr': a},
+                    fetch_list=[lr, cost],
+                    return_numpy=False,
+                )
+            self.assertEqual(_lr._dtype(), lr.dtype)
+            self.assertEqual(_lr._dtype(), paddle.float32)
+            self.assertEqual(type(a), float)
+
+    def test_program_feed_int(self):
+        main_program = base.Program()
+        startup_program = base.Program()
+        scope = base.Scope()
+        with base.program_guard(main_program, startup_program):
+            with base.scope_guard(scope):
+                cpu = base.CPUPlace()
+                exe = base.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
+                    'float32'
+                )
+                y_true = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype(
+                    'float32'
+                )
+                a = 0
+                _lr, _ = exe.run(
+                    feed={'x': train_data, 'y': y_true, 'lr': a},
+                    fetch_list=[lr, cost],
+                    return_numpy=False,
+                )
+            self.assertEqual(_lr._dtype(), lr.dtype)
+            self.assertEqual(_lr._dtype(), paddle.float32)
+            self.assertEqual(type(a), int)
+
+    def test_program_feed_list(self):
+        main_program = base.Program()
+        startup_program = base.Program()
+        scope = base.Scope()
+        with base.program_guard(main_program, startup_program):
+            with base.scope_guard(scope):
+                cpu = base.CPUPlace()
+                exe = base.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                train_data = [[1.0], [2.0], [3.0], [4.0]]
+                y_true = [[2.0], [4.0], [6.0], [8.0]]
+                a = 0
+                _lr, _ = exe.run(
+                    feed={'x': train_data, 'y': y_true, 'lr': a},
+                    fetch_list=[lr, cost],
+                    return_numpy=False,
+                )
+            self.assertEqual(_lr._dtype(), lr.dtype)
+            self.assertEqual(_lr._dtype(), paddle.float32)
+            self.assertEqual(type(y_true), list)
+
+    def test_compiled_program_feed_scalar(self):
+        main_program = base.Program()
+        startup_program = base.Program()
+        scope = base.Scope()
+        with base.program_guard(main_program, startup_program):
+            with base.scope_guard(scope):
+                lr, cost = self.net()
+                cpu = base.CPUPlace()
+                exe = base.Executor(cpu)
+                exe.run(startup_program)
+                compiled_prog = base.CompiledProgram(main_program)
+                train_data = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype(
+                    'float32'
+                )
+                y_true = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype(
+                    'float32'
+                )
+                a = 0.01
+                _lr, _ = exe.run(
+                    compiled_prog,
+                    feed={'x': train_data, 'y': y_true, 'lr': a},
+                    fetch_list=[lr, cost],
+                    return_numpy=False,
+                )
+                self.assertEqual(_lr._dtype(), lr.dtype)
+                self.assertEqual(_lr._dtype(), paddle.float32)
+                self.assertEqual(type(a), float)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_eye_op.py b/test/deprecated/legacy_test/test_eye_op.py
index 41a4e6aea2f9d..cafbfbd96beb0 100644
--- a/test/deprecated/legacy_test/test_eye_op.py
+++ b/test/deprecated/legacy_test/test_eye_op.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 from op_test import OpTest
-from test_attribute_var import UnittestBase
+from test_attribute_var_deprecated import UnittestBase
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_fc_op.py b/test/deprecated/legacy_test/test_fc_op_deprecated.py
similarity index 66%
rename from test/deprecated/legacy_test/test_fc_op.py
rename to test/deprecated/legacy_test/test_fc_op_deprecated.py
index d3d2008c17e15..961fb6e006bad 100644
--- a/test/deprecated/legacy_test/test_fc_op.py
+++ b/test/deprecated/legacy_test/test_fc_op_deprecated.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest, paddle_static_guard
+from op_test import paddle_static_guard
 
 import paddle
 from paddle import base
@@ -54,88 +54,6 @@ def __init__(self, mb, ic, oc, h, w, bias_dims=2):
             self.bias = np.random.random(oc).astype("float32")
 
 
-class TestFCOp(OpTest):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2)
-
-    def setUp(self):
-        self.op_type = "fc"
-        self.config()
-
-        if self.with_bias:
-            self.inputs = {
-                'Input': self.matrix.input,
-                'W': self.matrix.weights,
-                'Bias': self.matrix.bias,
-            }
-        else:
-            self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
-
-        if self.with_relu:
-            activation_type = "relu"
-        else:
-            activation_type = ""
-        self.attrs = {'use_mkldnn': False, 'activation_type': activation_type}
-
-        self.outputs = {
-            'Out': fc_refer(self.matrix, self.with_bias, self.with_relu)
-        }
-
-    def test_check_output(self):
-        self.check_output(check_dygraph=False)
-
-
-class TestFCOpNoBias1(TestFCOp):
-    def config(self):
-        self.with_bias = False
-        self.with_relu = False
-        self.matrix = MatrixGenerate(2, 8, 10, 1, 1, 2)
-
-
-class TestFCOpNoBias2(TestFCOp):
-    def config(self):
-        self.with_bias = False
-        self.with_relu = False
-        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
-
-
-class TestFCOpNoBias4(TestFCOp):
-    def config(self):
-        self.with_bias = False
-        self.with_relu = False
-        self.matrix = MatrixGenerate(1, 32, 64, 3, 3, 1)
-
-
-class TestFCOpWithBias1(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = False
-        self.matrix = MatrixGenerate(3, 8, 10, 2, 1, 2)
-
-
-class TestFCOpWithBias2(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
-
-
-class TestFCOpWithBias3(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1)
-
-
-class TestFCOpWithPadding(TestFCOp):
-    def config(self):
-        self.with_bias = True
-        self.with_relu = True
-        self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2)
-
-
 class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase):
     def test_api(self):
         def run_program(num_flatten_dims):
diff --git a/test/deprecated/legacy_test/test_feed_data_check_shape_type.py b/test/deprecated/legacy_test/test_feed_data_check_shape_type_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_feed_data_check_shape_type.py
rename to test/deprecated/legacy_test/test_feed_data_check_shape_type_deprecated.py
index f89247860ba74..40858d81f65dd 100644
--- a/test/deprecated/legacy_test/test_feed_data_check_shape_type.py
+++ b/test/deprecated/legacy_test/test_feed_data_check_shape_type_deprecated.py
@@ -22,6 +22,8 @@
 from paddle import base
 from paddle.base import core
 
+paddle.enable_static()
+
 os.environ['CPU_NUM'] = str(4)
 np.random.seed(123)
 
diff --git a/test/deprecated/legacy_test/test_fleet_base.py b/test/deprecated/legacy_test/test_fleet_base.py
index 2ffd8a747c72d..a8b3203b131c2 100644
--- a/test/deprecated/legacy_test/test_fleet_base.py
+++ b/test/deprecated/legacy_test/test_fleet_base.py
@@ -18,7 +18,6 @@
 import numpy as np
 
 import paddle
-from paddle import base
 from paddle.distributed import fleet
 from paddle.distributed.fleet.base import role_maker
 
@@ -184,45 +183,5 @@ def test_dygraph_method(self):
             final_strategy = fleet._final_strategy()
 
 
-class TestFleetBaseSingleError(unittest.TestCase):
-    def setUp(self):
-        os.environ.pop("PADDLE_TRAINER_ENDPOINTS")
-
-    def gen_data(self):
-        return {
-            "x": np.random.random(size=(128, 32)).astype('float32'),
-            "y": np.random.randint(2, size=(128, 1)).astype('int64'),
-        }
-
-    def test_single_run_collective_minimize(self):
-        def test_single_error():
-            input_x = paddle.static.data(
-                name="x", shape=[-1, 32], dtype='float32'
-            )
-            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
-
-            fc_1 = paddle.static.nn.fc(x=input_x, size=64, activation='tanh')
-            prediction = paddle.static.nn.fc(
-                x=fc_1, size=2, activation='softmax'
-            )
-            cost = paddle.nn.functional.cross_entropy(
-                input=prediction,
-                label=input_y,
-                reduction='none',
-                use_softmax=False,
-            )
-            avg_cost = paddle.mean(x=cost)
-            fleet.init(is_collective=True)
-
-        # in non_distributed mode(use `python` to launch), raise error if has multi cards
-        if (
-            base.core.is_compiled_with_cuda()
-            and base.core.get_cuda_device_count() > 1
-        ):
-            self.assertRaises(ValueError, test_single_error)
-        else:
-            test_single_error()
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_fleet_base_2.py b/test/deprecated/legacy_test/test_fleet_base_2_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fleet_base_2.py
rename to test/deprecated/legacy_test/test_fleet_base_2_deprecated.py
diff --git a/test/deprecated/legacy_test/test_fleet_base_3.py b/test/deprecated/legacy_test/test_fleet_base_3_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fleet_base_3.py
rename to test/deprecated/legacy_test/test_fleet_base_3_deprecated.py
diff --git a/test/deprecated/legacy_test/test_fleet_base_deprecated.py b/test/deprecated/legacy_test/test_fleet_base_deprecated.py
new file mode 100644
index 0000000000000..496bae7442061
--- /dev/null
+++ b/test/deprecated/legacy_test/test_fleet_base_deprecated.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.distributed import fleet
+
+
+class TestFleetBaseSingleError(unittest.TestCase):
+    def gen_data(self):
+        return {
+            "x": np.random.random(size=(128, 32)).astype('float32'),
+            "y": np.random.randint(2, size=(128, 1)).astype('int64'),
+        }
+
+    def test_single_run_collective_minimize(self):
+        def test_single_error():
+            input_x = paddle.static.data(
+                name="x", shape=[-1, 32], dtype='float32'
+            )
+            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
+
+            fc_1 = paddle.static.nn.fc(x=input_x, size=64, activation='tanh')
+            prediction = paddle.static.nn.fc(
+                x=fc_1, size=2, activation='softmax'
+            )
+            cost = paddle.nn.functional.cross_entropy(
+                input=prediction,
+                label=input_y,
+                reduction='none',
+                use_softmax=False,
+            )
+            avg_cost = paddle.mean(x=cost)
+            fleet.init(is_collective=True)
+
+        # In non-distributed mode (launched with plain `python`), expect a ValueError on a CUDA build with more than one card
+        if (
+            base.core.is_compiled_with_cuda()
+            and base.core.get_cuda_device_count() > 1
+        ):
+            self.assertRaises(ValueError, test_single_error)
+        else:
+            test_single_error()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_fleet.py b/test/deprecated/legacy_test/test_fleet_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_fleet.py
rename to test/deprecated/legacy_test/test_fleet_deprecated.py
index 0e9eb0579cc98..0c39931932649 100644
--- a/test/deprecated/legacy_test/test_fleet.py
+++ b/test/deprecated/legacy_test/test_fleet_deprecated.py
@@ -16,6 +16,10 @@
 import os
 import unittest
 
+import paddle
+
+paddle.enable_static()
+
 
 class TestFleet1(unittest.TestCase):
     """
diff --git a/test/deprecated/legacy_test/test_fleet_nocvm_1.py b/test/deprecated/legacy_test/test_fleet_nocvm_1_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_fleet_nocvm_1.py
rename to test/deprecated/legacy_test/test_fleet_nocvm_1_deprecated.py
index d9962c1a27b38..331047d3cb325 100644
--- a/test/deprecated/legacy_test/test_fleet_nocvm_1.py
+++ b/test/deprecated/legacy_test/test_fleet_nocvm_1_deprecated.py
@@ -18,6 +18,8 @@
 
 import paddle
 
+paddle.enable_static()
+
 
 class TestFleet1(unittest.TestCase):
     """
diff --git a/test/deprecated/legacy_test/test_fleet_unitaccessor.py b/test/deprecated/legacy_test/test_fleet_unitaccessor_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_fleet_unitaccessor.py
rename to test/deprecated/legacy_test/test_fleet_unitaccessor_deprecated.py
index f6e33ed1ee6b3..faf5487e4bf9a 100644
--- a/test/deprecated/legacy_test/test_fleet_unitaccessor.py
+++ b/test/deprecated/legacy_test/test_fleet_unitaccessor_deprecated.py
@@ -18,6 +18,8 @@
 
 import paddle
 
+paddle.enable_static()
+
 
 class TestFleet1(unittest.TestCase):
     """
diff --git a/test/deprecated/legacy_test/test_functional_conv2d.py b/test/deprecated/legacy_test/test_functional_conv2d_deprecated.py
similarity index 64%
rename from test/deprecated/legacy_test/test_functional_conv2d.py
rename to test/deprecated/legacy_test/test_functional_conv2d_deprecated.py
index 0cc69fb0281a4..790cebd22423c 100644
--- a/test/deprecated/legacy_test/test_functional_conv2d.py
+++ b/test/deprecated/legacy_test/test_functional_conv2d_deprecated.py
@@ -22,6 +22,8 @@
 import paddle.nn.functional as F
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestFunctionalConv2D(TestCase):
     batch_size = 4
@@ -196,76 +198,6 @@ def test_identity_gpu(self):
         self._test_identity()
 
 
-class TestFunctionalConv2DError(TestCase):
-    batch_size = 4
-    spatial_shape = (16, 16)
-    dtype = "float32"
-
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "not_valid"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NHWC"
-
-    def test_exception(self):
-        self.prepare()
-        with self.assertRaises(ValueError):
-            self.static_graph_case()
-
-    def prepare(self):
-        if isinstance(self.filter_shape, int):
-            filter_shape = (self.filter_shape,) * 2
-        else:
-            filter_shape = tuple(self.filter_shape)
-        self.weight_shape = (
-            self.out_channels,
-            self.in_channels // self.groups,
-        ) + filter_shape
-        self.bias_shape = (self.out_channels,)
-
-    def static_graph_case(self):
-        main = base.Program()
-        start = base.Program()
-        with base.unique_name.guard():
-            with base.program_guard(main, start):
-                self.channel_last = self.data_format == "NHWC"
-                if self.channel_last:
-                    x = x = paddle.static.data(
-                        "input",
-                        (-1, -1, -1, self.in_channels),
-                        dtype=self.dtype,
-                    )
-                else:
-                    x = paddle.static.data(
-                        "input",
-                        (-1, self.in_channels, -1, -1),
-                        dtype=self.dtype,
-                    )
-                weight = paddle.static.data(
-                    "weight", self.weight_shape, dtype=self.dtype
-                )
-                if not self.no_bias:
-                    bias = paddle.static.data(
-                        "bias", self.bias_shape, dtype=self.dtype
-                    )
-                y = F.conv2d(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format,
-                )
-
-
 class TestFunctionalConv2DCase2(TestFunctionalConv2D):
     def setUp(self):
         self.in_channels = 3
@@ -371,126 +303,6 @@ def setUp(self):
         self.data_format = "NCHW"
 
 
-class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = [[0, 0], [1, 2], [3, 4], [5, 6]]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NCHW"
-
-
-class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 4
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "not_valid"
-
-
-class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 3
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NCHW"
-
-
-class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
-        self.data_format = "not_valid"
-
-
-class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = [1, 2, 1, 2, 1]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = True
-        self.data_format = "NCHW"
-
-
-class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = -5
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = [[0, 0], [0, 0], [3, 2], [1, 2]]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NCHW"
-
-
-class TestFunctionalConv2DErrorCase10(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 4
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NHWC"
-
-
-class TestFunctionalConv2DErrorCase11(TestFunctionalConv2DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.use_cudnn = False
-        self.data_format = "NHCW"
-
-
 class TestFunctionalConv2DErrorCase12(TestCase):
     def setUp(self):
         self.input = np.array([])
@@ -532,30 +344,6 @@ def static_graph_case(self):
         (out,) = exe.run(main, feed={"input": self.input}, fetch_list=[y])
         return out
 
-    def dygraph_case(self):
-        with dg.guard():
-            x = paddle.to_tensor(self.input, dtype=paddle.float32)
-            w = paddle.to_tensor(self.filter, dtype=paddle.float32)
-            b = (
-                None
-                if self.bias is None
-                else paddle.to_tensor(self.bias, dtype=paddle.float32)
-            )
-            y = F.conv2d(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format,
-            )
-
-    def test_dygraph_exception(self):
-        with self.assertRaises(ValueError):
-            self.dygraph_case()
-
     def test_static_exception(self):
         with self.assertRaises(ValueError):
             self.static_graph_case()
diff --git a/test/deprecated/legacy_test/test_functional_conv2d_transpose.py b/test/deprecated/legacy_test/test_functional_conv2d_transpose_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_functional_conv2d_transpose.py
rename to test/deprecated/legacy_test/test_functional_conv2d_transpose_deprecated.py
index c8d5f88af6de3..8fb2393bbd7a9 100644
--- a/test/deprecated/legacy_test/test_functional_conv2d_transpose.py
+++ b/test/deprecated/legacy_test/test_functional_conv2d_transpose_deprecated.py
@@ -22,6 +22,8 @@
 import paddle.nn.functional as F
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestFunctionalConv2D(TestCase):
     batch_size = 4
diff --git a/test/deprecated/legacy_test/test_functional_conv3d.py b/test/deprecated/legacy_test/test_functional_conv3d_deprecated.py
similarity index 66%
rename from test/deprecated/legacy_test/test_functional_conv3d.py
rename to test/deprecated/legacy_test/test_functional_conv3d_deprecated.py
index 6634d0194670b..68e65a7db30b0 100644
--- a/test/deprecated/legacy_test/test_functional_conv3d.py
+++ b/test/deprecated/legacy_test/test_functional_conv3d_deprecated.py
@@ -22,6 +22,8 @@
 import paddle.nn.functional as F
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestFunctionalConv3D(TestCase):
     batch_size = 4
@@ -196,79 +198,6 @@ def test_identity_gpu(self):
         self._test_identity()
 
 
-class TestFunctionalConv3DError(TestCase):
-    batch_size = 4
-    spatial_shape = (8, 8, 8)
-    dtype = "float32"
-
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "not_valid"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-    def test_exception(self):
-        self.prepare()
-        with self.assertRaises(ValueError):
-            self.static_graph_case()
-
-    def prepare(self):
-        if isinstance(self.filter_shape, int):
-            filter_shape = (self.filter_shape,) * 3
-        else:
-            filter_shape = tuple(self.filter_shape)
-        self.weight_shape = (
-            self.out_channels,
-            self.in_channels // self.groups,
-        ) + filter_shape
-        self.bias_shape = (self.out_channels,)
-
-    def static_graph_case(self):
-        main = base.Program()
-        start = base.Program()
-        with base.unique_name.guard():
-            with base.program_guard(main, start):
-                self.channel_last = self.data_format == "NDHWC"
-                if self.channel_last:
-                    x = x = paddle.static.data(
-                        "input",
-                        (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype,
-                    )
-                else:
-                    x = paddle.static.data(
-                        "input",
-                        (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype,
-                    )
-                weight = paddle.static.data(
-                    "weight", self.weight_shape, dtype=self.dtype
-                )
-                if not self.no_bias:
-                    bias = paddle.static.data(
-                        "bias", self.bias_shape, dtype=self.dtype
-                    )
-                y = F.conv3d(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format,
-                )
-
-                if self.act == 'sigmoid':
-                    y = F.sigmoid(y)
-
-
 class TestFunctionalConv3DCase2(TestFunctionalConv3D):
     def setUp(self):
         self.in_channels = 3
@@ -368,104 +297,6 @@ def setUp(self):
         self.data_format = "NCDHW"
 
 
-class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = [[0, 0], [1, 1], [1, 2], [3, 4], [5, 6]]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 4
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "not_valid"
-
-
-class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 3
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "not_valid"
-
-
-class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = [1, 2, 1, 2, 1]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = -5
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = [[0, 0], [0, 0], [3, 2], [1, 2], [1, 1]]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 4
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-
 class TestFunctionalConv3DErrorCase11(TestCase):
     def setUp(self):
         self.input = np.array([])
@@ -507,30 +338,6 @@ def static_graph_case(self):
         (out,) = exe.run(main, feed={"input": self.input}, fetch_list=[y])
         return out
 
-    def dygraph_case(self):
-        with dg.guard():
-            x = paddle.to_tensor(self.input, dtype=paddle.float32)
-            w = paddle.to_tensor(self.filter, dtype=paddle.float32)
-            b = (
-                None
-                if self.bias is None
-                else paddle.to_tensor(self.bias, dtype=paddle.float32)
-            )
-            y = F.conv3d(
-                x,
-                w,
-                b,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format,
-            )
-
-    def test_dygraph_exception(self):
-        with self.assertRaises(ValueError):
-            self.dygraph_case()
-
     def test_static_exception(self):
         with self.assertRaises(ValueError):
             self.static_graph_case()
diff --git a/test/deprecated/legacy_test/test_functional_conv3d_transpose.py b/test/deprecated/legacy_test/test_functional_conv3d_transpose.py
index a4ea020dd2996..5f5ee23b04cfe 100644
--- a/test/deprecated/legacy_test/test_functional_conv3d_transpose.py
+++ b/test/deprecated/legacy_test/test_functional_conv3d_transpose.py
@@ -23,179 +23,6 @@
 from paddle import base
 
 
-class TestFunctionalConv3DTranspose(TestCase):
-    batch_size = 4
-    spatial_shape = (8, 8, 8)
-    dtype = "float32"
-    output_size = None
-
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-    def prepare(self):
-        if isinstance(self.filter_shape, int):
-            filter_shape = (self.filter_shape,) * 3
-        else:
-            filter_shape = tuple(self.filter_shape)
-
-        self.weight = np.random.uniform(
-            -1,
-            1,
-            (self.in_channels, self.out_channels // self.groups) + filter_shape,
-        ).astype(self.dtype)
-        if not self.no_bias:
-            self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype(
-                self.dtype
-            )
-
-        self.channel_last = self.data_format == "NDHWC"
-        if self.channel_last:
-            self.input_shape = (
-                (self.batch_size,) + self.spatial_shape + (self.in_channels,)
-            )
-        else:
-            self.input_shape = (
-                self.batch_size,
-                self.in_channels,
-            ) + self.spatial_shape
-
-        self.input = np.random.uniform(-1, 1, self.input_shape).astype(
-            self.dtype
-        )
-
-    def static_graph_case_1(self):
-        main = base.Program()
-        start = base.Program()
-        with base.unique_name.guard():
-            with base.program_guard(main, start):
-                if self.channel_last:
-                    x = paddle.static.data(
-                        "input",
-                        (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype,
-                    )
-                else:
-                    x = paddle.static.data(
-                        "input",
-                        (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype,
-                    )
-                y = paddle.static.nn.conv3d_transpose(
-                    x,
-                    self.out_channels,
-                    output_size=self.output_size,
-                    filter_size=self.filter_shape,
-                    stride=self.stride,
-                    padding=self.padding,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=paddle.nn.initializer.Assign(self.weight),
-                    bias_attr=False
-                    if self.no_bias
-                    else paddle.nn.initializer.Assign(self.bias),
-                    act=self.act,
-                    data_format=self.data_format,
-                )
-        exe = base.Executor(self.place)
-        exe.run(start)
-        (out,) = exe.run(main, feed={"input": self.input}, fetch_list=[y])
-        return out
-
-    def static_graph_case_2(self):
-        main = base.Program()
-        start = base.Program()
-        with base.unique_name.guard():
-            with base.program_guard(main, start):
-                if self.channel_last:
-                    x = x = paddle.static.data(
-                        "input",
-                        (-1, -1, -1, -1, self.in_channels),
-                        dtype=self.dtype,
-                    )
-                else:
-                    x = paddle.static.data(
-                        "input",
-                        (-1, self.in_channels, -1, -1, -1),
-                        dtype=self.dtype,
-                    )
-                weight = paddle.static.data(
-                    "weight", self.weight.shape, dtype=self.dtype
-                )
-                if not self.no_bias:
-                    bias = paddle.static.data(
-                        "bias", self.bias.shape, dtype=self.dtype
-                    )
-                y = F.conv3d_transpose(
-                    x,
-                    weight,
-                    None if self.no_bias else bias,
-                    output_size=self.output_size,
-                    padding=self.padding,
-                    stride=self.stride,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    data_format=self.data_format,
-                )
-                if self.act == 'sigmoid':
-                    y = F.sigmoid(y)
-        exe = base.Executor(self.place)
-        exe.run(start)
-        feed_dict = {"input": self.input, "weight": self.weight}
-        if not self.no_bias:
-            feed_dict["bias"] = self.bias
-        (out,) = exe.run(main, feed=feed_dict, fetch_list=[y])
-        return out
-
-    def dygraph_case(self):
-        with dg.guard(self.place):
-            x = paddle.to_tensor(self.input)
-            weight = paddle.to_tensor(self.weight)
-            bias = None if self.no_bias else paddle.to_tensor(self.bias)
-            y = F.conv3d_transpose(
-                x,
-                weight,
-                bias,
-                output_size=self.output_size,
-                padding=self.padding,
-                stride=self.stride,
-                dilation=self.dilation,
-                groups=self.groups,
-                data_format=self.data_format,
-            )
-            if self.act == 'sigmoid':
-                y = F.sigmoid(y)
-            out = y.numpy()
-        return out
-
-    def _test_identity(self):
-        self.prepare()
-        out1 = self.static_graph_case_1()
-        out2 = self.static_graph_case_2()
-        out3 = self.dygraph_case()
-        np.testing.assert_array_almost_equal(out1, out2)
-        np.testing.assert_array_almost_equal(out2, out3)
-
-    def test_identity_cpu(self):
-        self.place = base.CPUPlace()
-        self._test_identity()
-
-    @unittest.skipIf(
-        not base.core.is_compiled_with_cuda(), "core is not compiled with CUDA"
-    )
-    def test_identity_gpu(self):
-        self.place = base.CUDAPlace(0)
-        self._test_identity()
-
-
 class TestFunctionalConv3DTransposeError(TestCase):
     batch_size = 4
     spatial_shape = (8, 8, 8)
@@ -270,147 +97,6 @@ def static_graph_case(self):
                     y = F.sigmoid(y)
 
 
-class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 3
-        self.out_channels = 5
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = 0
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-
-class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = "same"
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = True
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-
-class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = "valid"
-        self.stride = (1, 2, 1)
-        self.dilation = (2, 1, 1)
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-
-class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 4
-        self.filter_shape = 3
-        self.padding = "valid"
-        self.stride = (1, 2, 1)
-        self.dilation = 1
-        self.groups = 4
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-
-class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 4
-        self.filter_shape = 3
-        self.padding = "valid"
-        self.output_size = (10, 17, 10)
-        self.stride = (1, 2, 1)
-        self.dilation = 1
-        self.groups = 1
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = [[0, 0], [1, 2], [1, 2], [2, 1], [0, 0]]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NDHWC"
-
-
-class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = [[0, 0], [0, 0], [1, 1], [1, 1], [2, 2]]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = [1, 1, 2, 2, 1, 1]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
-class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose):
-    def setUp(self):
-        self.in_channels = 4
-        self.out_channels = 6
-        self.filter_shape = 3
-        self.padding = [1, 2, 1]
-        self.stride = 1
-        self.dilation = 1
-        self.groups = 2
-        self.no_bias = False
-        self.act = "sigmoid"
-        self.data_format = "NCDHW"
-
-
 class TestFunctionalConv3DTransposeErrorCase2(
     TestFunctionalConv3DTransposeError
 ):
@@ -537,34 +223,6 @@ def setUp(self):
         self.groups = 1
         self.data_format = "NCDHW"
 
-    def static_graph_case(self):
-        main = base.Program()
-        start = base.Program()
-        with base.unique_name.guard():
-            with base.program_guard(main, start):
-                x = paddle.static.data(
-                    "input", self.input.shape, dtype=paddle.float32
-                )
-                y = paddle.static.nn.conv3d_transpose(
-                    x,
-                    self.num_filters,
-                    self.filter_size,
-                    stride=self.stride,
-                    padding=self.padding,
-                    dilation=self.dilation,
-                    groups=self.groups,
-                    param_attr=paddle.nn.initializer.Assign(self.filter),
-                    bias_attr=False
-                    if self.bias is None
-                    else paddle.nn.initializer.Assign(self.bias),
-                    act=None,
-                    data_format=self.data_format,
-                )
-        exe = base.Executor()
-        exe.run(start)
-        (out,) = exe.run(main, feed={"input": self.input}, fetch_list=[y])
-        return out
-
     def dygraph_case(self):
         with dg.guard():
             x = paddle.to_tensor(self.input, dtype=paddle.float32)
@@ -589,10 +247,6 @@ def test_dygraph_exception(self):
         with self.assertRaises(ValueError):
             self.dygraph_case()
 
-    def test_static_exception(self):
-        with self.assertRaises(ValueError):
-            self.static_graph_case()
-
 
 class TestFunctionalConv3DTransposeErrorCase11(
     TestFunctionalConv3DTransposeErrorCase10
diff --git a/test/deprecated/legacy_test/test_functional_conv3d_transpose_deprecated.py b/test/deprecated/legacy_test/test_functional_conv3d_transpose_deprecated.py
new file mode 100644
index 0000000000000..5be6713d7f2e8
--- /dev/null
+++ b/test/deprecated/legacy_test/test_functional_conv3d_transpose_deprecated.py
@@ -0,0 +1,405 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest import TestCase
+
+import numpy as np
+
+import paddle
+import paddle.base.dygraph as dg
+import paddle.nn.functional as F
+from paddle import base
+
+paddle.enable_static()
+
+
+class TestFunctionalConv3DTranspose(TestCase):
+    """Compare conv3d_transpose outputs of two static graphs and dygraph.
+
+    Subclasses override setUp to vary layout, padding, stride and groups.
+    """
+
+    batch_size = 4
+    spatial_shape = (8, 8, 8)
+    dtype = "float32"
+    output_size = None
+
+    def setUp(self):
+        """Default config: NDHWC layout, bias enabled, sigmoid activation."""
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NDHWC"
+
+    def prepare(self):
+        """Draw random weight/bias/input arrays for the current config."""
+        if isinstance(self.filter_shape, int):
+            filter_shape = (self.filter_shape,) * 3
+        else:
+            filter_shape = tuple(self.filter_shape)
+
+        self.weight = np.random.uniform(
+            -1,
+            1,
+            (self.in_channels, self.out_channels // self.groups) + filter_shape,
+        ).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype(
+                self.dtype
+            )
+
+        self.channel_last = self.data_format == "NDHWC"
+        if self.channel_last:
+            self.input_shape = (
+                (self.batch_size,) + self.spatial_shape + (self.in_channels,)
+            )
+        else:
+            self.input_shape = (
+                self.batch_size,
+                self.in_channels,
+            ) + self.spatial_shape
+
+        self.input = np.random.uniform(-1, 1, self.input_shape).astype(
+            self.dtype
+        )
+
+    def _static_input(self):
+        """Declare the "input" placeholder; only the channel dim is fixed."""
+        if self.channel_last:
+            shape = (-1, -1, -1, -1, self.in_channels)
+        else:
+            shape = (-1, self.in_channels, -1, -1, -1)
+        return paddle.static.data("input", shape, dtype=self.dtype)
+
+    def static_graph_case_1(self):
+        """Run paddle.static.nn.conv3d_transpose with Assign-ed params."""
+        main = base.Program()
+        start = base.Program()
+        with base.unique_name.guard():
+            with base.program_guard(main, start):
+                x = self._static_input()
+                y = paddle.static.nn.conv3d_transpose(
+                    x,
+                    self.out_channels,
+                    output_size=self.output_size,
+                    filter_size=self.filter_shape,
+                    stride=self.stride,
+                    padding=self.padding,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    param_attr=paddle.nn.initializer.Assign(self.weight),
+                    bias_attr=False
+                    if self.no_bias
+                    else paddle.nn.initializer.Assign(self.bias),
+                    act=self.act,
+                    data_format=self.data_format,
+                )
+        exe = base.Executor(self.place)
+        exe.run(start)
+        (out,) = exe.run(main, feed={"input": self.input}, fetch_list=[y])
+        return out
+
+    def static_graph_case_2(self):
+        """Run F.conv3d_transpose in static mode with weight/bias fed in."""
+        main = base.Program()
+        start = base.Program()
+        with base.unique_name.guard():
+            with base.program_guard(main, start):
+                x = self._static_input()
+                weight = paddle.static.data(
+                    "weight", self.weight.shape, dtype=self.dtype
+                )
+                if not self.no_bias:
+                    bias = paddle.static.data(
+                        "bias", self.bias.shape, dtype=self.dtype
+                    )
+                y = F.conv3d_transpose(
+                    x,
+                    weight,
+                    None if self.no_bias else bias,
+                    output_size=self.output_size,
+                    padding=self.padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format,
+                )
+                # act is not passed to F.conv3d_transpose, so apply it here.
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
+        exe = base.Executor(self.place)
+        exe.run(start)
+        feed_dict = {"input": self.input, "weight": self.weight}
+        if not self.no_bias:
+            feed_dict["bias"] = self.bias
+        (out,) = exe.run(main, feed=feed_dict, fetch_list=[y])
+        return out
+
+    def dygraph_case(self):
+        """Run F.conv3d_transpose under dg.guard and return y.numpy()."""
+        with dg.guard(self.place):
+            x = paddle.to_tensor(self.input)
+            weight = paddle.to_tensor(self.weight)
+            bias = None if self.no_bias else paddle.to_tensor(self.bias)
+            y = F.conv3d_transpose(
+                x,
+                weight,
+                bias,
+                output_size=self.output_size,
+                padding=self.padding,
+                stride=self.stride,
+                dilation=self.dilation,
+                groups=self.groups,
+                data_format=self.data_format,
+            )
+            if self.act == 'sigmoid':
+                y = F.sigmoid(y)
+            out = y.numpy()
+        return out
+
+    def _test_identity(self):
+        """Assert the three execution paths give almost-equal outputs."""
+        self.prepare()
+        out1 = self.static_graph_case_1()
+        out2 = self.static_graph_case_2()
+        out3 = self.dygraph_case()
+        np.testing.assert_array_almost_equal(out1, out2)
+        np.testing.assert_array_almost_equal(out2, out3)
+
+    def test_identity_cpu(self):
+        """Run the comparison on base.CPUPlace()."""
+        self.place = base.CPUPlace()
+        self._test_identity()
+
+    @unittest.skipIf(
+        not base.core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    )
+    def test_identity_gpu(self):
+        """Run the comparison on base.CUDAPlace(0)."""
+        self.place = base.CUDAPlace(0)
+        self._test_identity()
+
+
+class TestFunctionalConv3DTransposeCase2(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """3 -> 5 channels, groups=1, channel-first (NCDHW) input."""
+        self.data_format = "NCDHW"
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.groups = 1
+        self.padding = 0
+        self.stride = self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase3(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """4 -> 6 channels with groups=2 on channel-last (NDHWC) input."""
+        self.data_format = "NDHWC"
+        self.in_channels = 4
+        self.out_channels = 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = 0
+        self.stride = self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase4(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """String padding "same", groups=2, and bias disabled."""
+        self.data_format = "NDHWC"
+        self.in_channels = 4
+        self.out_channels = 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = "same"
+        self.stride = self.dilation = 1
+        self.no_bias = True
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase5(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """String padding "valid" with tuple-valued stride and dilation."""
+        self.data_format = "NDHWC"
+        self.in_channels, self.out_channels = 4, 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = "valid"
+        self.stride = (1, 2, 1)
+        self.dilation = (2, 1, 1)
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase6(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """groups equals the channel count (4) with a tuple-valued stride."""
+        self.data_format = "NDHWC"
+        self.in_channels = self.out_channels = 4
+        self.filter_shape = 3
+        self.groups = 4
+        self.padding = "valid"
+        self.stride = (1, 2, 1)
+        self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase7(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """Explicit output_size with "valid" padding on NCDHW input."""
+        self.data_format = "NCDHW"
+        self.in_channels = self.out_channels = 4
+        self.filter_shape = 3
+        self.groups = 1
+        self.padding = "valid"
+        self.output_size = (10, 17, 10)
+        self.stride = (1, 2, 1)
+        self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase8(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """Padding given as five 2-element pairs, NDHWC layout."""
+        self.data_format = "NDHWC"
+        self.in_channels = 4
+        self.out_channels = 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = [[0, 0], [1, 2], [1, 2], [2, 1], [0, 0]]
+        self.stride = self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase9(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """Padding given as five 2-element pairs, NCDHW layout."""
+        self.data_format = "NCDHW"
+        self.in_channels = 4
+        self.out_channels = 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = [[0, 0], [0, 0], [1, 1], [1, 1], [2, 2]]
+        self.stride = self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase10(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """Padding given as a flat list of six ints."""
+        self.data_format = "NCDHW"
+        self.in_channels = 4
+        self.out_channels = 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = [1, 1, 2, 2, 1, 1]
+        self.stride = self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeCase11(TestFunctionalConv3DTranspose):
+    def setUp(self):
+        """Padding given as a list of three ints."""
+        self.data_format = "NCDHW"
+        self.in_channels = 4
+        self.out_channels = 6
+        self.filter_shape = 3
+        self.groups = 2
+        self.padding = [1, 2, 1]
+        self.stride = self.dilation = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+
+
+class TestFunctionalConv3DTransposeErrorCase10(TestCase):
+    """Static conv3d_transpose must raise ValueError for empty arrays."""
+
+    def setUp(self):
+        self.input = np.array([])
+        self.filter = np.array([])
+        self.bias = None
+        self.num_filters = self.filter_size = 0
+        self.padding = 0
+        self.stride = self.dilation = 1
+        self.groups = 1
+        self.data_format = "NCDHW"
+
+    def static_graph_case(self):
+        """Build and run static conv3d_transpose from the setUp config."""
+        prog, init = base.Program(), base.Program()
+        with base.unique_name.guard():
+            with base.program_guard(prog, init):
+                feat = paddle.static.data(
+                    "input", self.input.shape, dtype=paddle.float32
+                )
+                res = paddle.static.nn.conv3d_transpose(
+                    feat,
+                    self.num_filters,
+                    self.filter_size,
+                    stride=self.stride,
+                    padding=self.padding,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    param_attr=paddle.nn.initializer.Assign(self.filter),
+                    bias_attr=False
+                    if self.bias is None
+                    else paddle.nn.initializer.Assign(self.bias),
+                    act=None,
+                    data_format=self.data_format,
+                )
+        exe = base.Executor()
+        exe.run(init)
+        (out,) = exe.run(prog, feed={"input": self.input}, fetch_list=[res])
+        return out
+
+    def test_static_exception(self):
+        with self.assertRaises(ValueError):
+            self.static_graph_case()
+
+
+class TestFunctionalConv3DTransposeErrorCase11(
+    TestFunctionalConv3DTransposeErrorCase10
+):
+    def setUp(self):
+        """Non-empty random input and filter, but groups is set to 0."""
+        self.input = np.random.randn(1, 3, 3, 3, 3)
+        self.filter = np.random.randn(3, 3, 1, 1, 1)
+        self.bias = None
+        self.num_filters = 3
+        self.filter_size = 1
+        self.padding = 0
+        self.stride = self.dilation = 1
+        self.groups = 0
+        self.data_format = "NCDHW"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_fuse_bn_act_pass.py b/test/deprecated/legacy_test/test_fuse_bn_act_pass_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_fuse_bn_act_pass.py
rename to test/deprecated/legacy_test/test_fuse_bn_act_pass_deprecated.py
index 6faaff59b51ef..958cfe70dcc0d 100644
--- a/test/deprecated/legacy_test/test_fuse_bn_act_pass.py
+++ b/test/deprecated/legacy_test/test_fuse_bn_act_pass_deprecated.py
@@ -17,6 +17,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestFuseBatchNormActPass(unittest.TestCase):
     def build_program(self, main_program, startup_program, use_cuda, seed=1):
diff --git a/test/deprecated/legacy_test/test_get_inputs_outputs_in_block.py b/test/deprecated/legacy_test/test_get_inputs_outputs_in_block_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_get_inputs_outputs_in_block.py
rename to test/deprecated/legacy_test/test_get_inputs_outputs_in_block_deprecated.py
diff --git a/test/deprecated/legacy_test/test_gradient_clip.py b/test/deprecated/legacy_test/test_gradient_clip_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_gradient_clip.py
rename to test/deprecated/legacy_test/test_gradient_clip_deprecated.py
diff --git a/test/deprecated/legacy_test/test_hsigmoid_op_deprecated.py b/test/deprecated/legacy_test/test_hsigmoid_op_deprecated.py
new file mode 100644
index 0000000000000..574bc03172a4f
--- /dev/null
+++ b/test/deprecated/legacy_test/test_hsigmoid_op_deprecated.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+paddle.enable_static()
+np.random.seed(100)
+
+
+class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
+    def hs_net_conf(self, is_sparse):
+        """Build the embedding + HSigmoidLoss net; return (loss, inputs)."""
+        input_word = paddle.static.data(name="x", shape=[-1, 1], dtype='int64')
+        path_table = paddle.static.data(
+            name='path_table', shape=[-1, 3], dtype='int64'
+        )
+        path_code = paddle.static.data(
+            name='path_code', shape=[-1, 3], dtype='int64'
+        )
+        label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
+
+        data_list = [input_word, path_table, path_code, label]
+
+        emb = paddle.static.nn.embedding(
+            input=input_word,
+            is_sparse=is_sparse,
+            size=[3, 3],
+            param_attr=base.ParamAttr(
+                initializer=paddle.nn.initializer.Normal(std=1 / math.sqrt(3))
+            ),
+        )
+
+        loss = paddle.nn.HSigmoidLoss(
+            feature_size=emb.shape[1],
+            num_classes=3,
+            bias_attr=True,
+            is_custom=True,
+            is_sparse=is_sparse,
+        )
+
+        cost = loss(
+            input=emb,
+            label=label,
+            path_table=path_table,
+            path_code=path_code,
+        )
+
+        return paddle.mean(cost), data_list
+
+    def training_test(self, is_sparse):
+        """Train 10 SGD steps on CPU and return the fetched loss per step."""
+        with paddle.static.program_guard(
+            paddle.static.Program(), paddle.static.Program()
+        ):
+            paddle.seed(1)
+            start_up = paddle.static.default_startup_program()
+            x = np.arange(6).reshape(6)
+            path_table = np.array([(1, 2, -1), (1, 2, -1)]).astype('int64')
+            path_code = np.array([(1, 0, -1), (0, 0, -1)]).astype('int64')
+            label = np.array([1, 4]).astype('int64')
+
+            loss, data_list = self.hs_net_conf(is_sparse)
+            optimizer = paddle.optimizer.SGD(learning_rate=1e-3)
+            optimizer.minimize(loss)
+
+            main_program = paddle.static.default_main_program()
+            place = base.CPUPlace()
+            feeder = base.DataFeeder(feed_list=data_list, place=place)
+            exe = paddle.static.Executor(place)
+            exe.run(start_up)
+            result = []
+            for i in range(10):
+                data = [
+                    (
+                        [[x[i % 2]]],
+                        [list(path_table[i % 2])],
+                        [list(path_code[i % 2])],
+                        [label[i % 2]],
+                    )
+                ]
+
+                loss_val = exe.run(
+                    main_program, feed=feeder.feed(data), fetch_list=[loss]
+                )
+                result.append(loss_val)
+        return result
+
+    def test_hs_grad_with_sparse(self):
+        """Dense and sparse runs must produce exactly equal losses."""
+        dense_result = self.training_test(is_sparse=False)
+        sparse_result = self.training_test(is_sparse=True)
+        np.testing.assert_array_equal(dense_result, sparse_result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_image_classification_layer.py b/test/deprecated/legacy_test/test_image_classification_layer.py
index cacffb437bad0..c786db344d47d 100644
--- a/test/deprecated/legacy_test/test_image_classification_layer.py
+++ b/test/deprecated/legacy_test/test_image_classification_layer.py
@@ -38,21 +38,6 @@ def conv_block(input, num_filter, groups, dropouts):
 
 
 class TestLayer(unittest.TestCase):
-    def test_batch_norm_layer(self):
-        main_program = Program()
-        startup_program = Program()
-        with base.program_guard(main_program, startup_program):
-            images = paddle.static.data(
-                name='pixel', shape=[-1, 3, 48, 48], dtype='float32'
-            )
-            hidden1 = paddle.static.nn.batch_norm(input=images)
-            hidden2 = paddle.static.nn.fc(
-                x=hidden1, size=128, activation='relu'
-            )
-            paddle.static.nn.batch_norm(input=hidden2)
-
-        print(str(main_program))
-
     def test_dropout_layer(self):
         main_program = Program()
         startup_program = Program()
diff --git a/test/deprecated/legacy_test/test_image_classification_layer_deprecated.py b/test/deprecated/legacy_test/test_image_classification_layer_deprecated.py
new file mode 100644
index 0000000000000..a977388a35283
--- /dev/null
+++ b/test/deprecated/legacy_test/test_image_classification_layer_deprecated.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+sys.path.append("../../legacy_test")
+import nets
+
+import paddle
+from paddle import base
+from paddle.base.framework import Program
+
+paddle.enable_static()
+
+
+def conv_block(input, num_filter, groups, dropouts):  # wraps img_conv_group
+    return nets.img_conv_group(
+        input=input,
+        conv_num_filter=[num_filter] * groups,
+        conv_filter_size=3,
+        conv_act='relu',
+        conv_with_batchnorm=True,
+        conv_batchnorm_drop_rate=dropouts,
+        pool_size=2,
+        pool_stride=2,
+        pool_type='max',
+    )
+
+
+class TestLayer(unittest.TestCase):
+    def test_batch_norm_layer(self):
+        """Build batch_norm -> fc -> batch_norm statically and print it."""
+        main_prog, startup_prog = Program(), Program()
+        with base.program_guard(main_prog, startup_prog):
+            pixels = paddle.static.data(
+                name='pixel', shape=[-1, 3, 48, 48], dtype='float32'
+            )
+            bn_hidden = paddle.static.nn.batch_norm(input=pixels)
+            fc_hidden = paddle.static.nn.fc(
+                x=bn_hidden, size=128, activation='relu'
+            )
+            paddle.static.nn.batch_norm(input=fc_hidden)
+
+        print(str(main_prog))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_imperative_double_grad.py b/test/deprecated/legacy_test/test_imperative_double_grad.py
index eca85b0cbf58e..0baf4369b0716 100644
--- a/test/deprecated/legacy_test/test_imperative_double_grad.py
+++ b/test/deprecated/legacy_test/test_imperative_double_grad.py
@@ -567,21 +567,6 @@ def model_f(input):
         np.testing.assert_array_equal(grad_1, grad_2)
 
 
-class TestRaiseNoDoubleGradOp(TestCase):
-    def test_no_grad_op(self):
-        with base.dygraph.guard():
-            x = paddle.ones(shape=[2, 3, 2, 2], dtype='float32')
-            x.stop_gradient = False
-            y = paddle.static.nn.group_norm(x, groups=1)
-
-            dx = base.dygraph.grad(
-                outputs=[y], inputs=[x], create_graph=True, retain_graph=True
-            )[0]
-
-            loss = paddle.mean(dx)
-            loss.backward()
-
-
 class TestDoubleGradResNet(TestCase):
     def setUp(self):
         paddle.seed(123)
diff --git a/test/deprecated/legacy_test/test_imperative_double_grad_deprecated.py b/test/deprecated/legacy_test/test_imperative_double_grad_deprecated.py
new file mode 100644
index 0000000000000..9fda4f4d3dc1f
--- /dev/null
+++ b/test/deprecated/legacy_test/test_imperative_double_grad_deprecated.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest import TestCase
+
+import paddle
+from paddle import base
+
+
+class TestRaiseNoDoubleGradOp(TestCase):
+    def test_no_grad_op(self):
+        """Backprop through the create_graph=True gradient of group_norm."""
+        with base.dygraph.guard():
+            inp = paddle.ones(shape=[2, 3, 2, 2], dtype='float32')
+            inp.stop_gradient = False
+            y = paddle.static.nn.group_norm(inp, groups=1)
+
+            grads = base.dygraph.grad(
+                outputs=[y], inputs=[inp], create_graph=True, retain_graph=True
+            )
+            loss = paddle.mean(grads[0])
+            loss.backward()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_imperative_load_static_param.py b/test/deprecated/legacy_test/test_imperative_load_static_param_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_imperative_load_static_param.py
rename to test/deprecated/legacy_test/test_imperative_load_static_param_deprecated.py
diff --git a/test/deprecated/legacy_test/test_imperative_lod_tensor_to_selected_rows.py b/test/deprecated/legacy_test/test_imperative_lod_tensor_to_selected_rows_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_imperative_lod_tensor_to_selected_rows.py
rename to test/deprecated/legacy_test/test_imperative_lod_tensor_to_selected_rows_deprecated.py
diff --git a/test/deprecated/legacy_test/test_infer_no_need_buffer_slots.py b/test/deprecated/legacy_test/test_infer_no_need_buffer_slots_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_infer_no_need_buffer_slots.py
rename to test/deprecated/legacy_test/test_infer_no_need_buffer_slots_deprecated.py
index 1ba17a9270c50..d1286cbd02aba 100644
--- a/test/deprecated/legacy_test/test_infer_no_need_buffer_slots.py
+++ b/test/deprecated/legacy_test/test_infer_no_need_buffer_slots_deprecated.py
@@ -18,6 +18,8 @@
 from paddle import base
 from paddle.base import core, framework
 
+paddle.enable_static()
+
 
 class TestInferNoNeedBufferSlots(unittest.TestCase):
     def net(self):
diff --git a/test/deprecated/legacy_test/test_inference_api.py b/test/deprecated/legacy_test/test_inference_api_deprecated.py
similarity index 81%
rename from test/deprecated/legacy_test/test_inference_api.py
rename to test/deprecated/legacy_test/test_inference_api_deprecated.py
index b6f5456ee4796..aba8f4cf82b86 100644
--- a/test/deprecated/legacy_test/test_inference_api.py
+++ b/test/deprecated/legacy_test/test_inference_api_deprecated.py
@@ -20,7 +20,6 @@
 import numpy as np
 
 from paddle import base
-from paddle.base.core import PaddleDType, PaddleTensor
 from paddle.framework import core
 from paddle.inference import (
     Config,
@@ -30,49 +29,6 @@
 )
 
 
-class TestInferenceApi(unittest.TestCase):
-    def test_inference_api(self):
-        tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32')
-        paddletensor32 = PaddleTensor(tensor32)
-        dtype32 = paddletensor32.dtype
-        self.assertEqual(dtype32, PaddleDType.INT32)
-        self.assertEqual(
-            paddletensor32.data.tolist('int32'), tensor32.ravel().tolist()
-        )
-        paddletensor32.data.reset(tensor32)
-        self.assertEqual(
-            paddletensor32.as_ndarray().ravel().tolist(),
-            tensor32.ravel().tolist(),
-        )
-
-        tensor64 = np.random.randint(10, 20, size=[20, 2]).astype('int64')
-        paddletensor64 = PaddleTensor(tensor64)
-        dtype64 = paddletensor64.dtype
-        self.assertEqual(dtype64, PaddleDType.INT64)
-        self.assertEqual(
-            paddletensor64.data.tolist('int64'), tensor64.ravel().tolist()
-        )
-        paddletensor64.data.reset(tensor64)
-        self.assertEqual(
-            paddletensor64.as_ndarray().ravel().tolist(),
-            tensor64.ravel().tolist(),
-        )
-
-        tensor_float = np.random.randn(20, 2).astype('float32')
-        paddletensor_float = PaddleTensor(tensor_float)
-        dtype_float = paddletensor_float.dtype
-        self.assertEqual(dtype_float, PaddleDType.FLOAT32)
-        self.assertEqual(
-            paddletensor_float.data.tolist('float32'),
-            tensor_float.ravel().tolist(),
-        )
-        paddletensor_float.data.reset(tensor_float)
-        self.assertEqual(
-            paddletensor_float.as_ndarray().ravel().tolist(),
-            tensor_float.ravel().tolist(),
-        )
-
-
 def get_sample_model():
     place = base.CPUPlace()
     exe = base.Executor(place)
diff --git a/test/deprecated/legacy_test/test_inference_model_io.py b/test/deprecated/legacy_test/test_inference_model_io_deprecated.py
similarity index 92%
rename from test/deprecated/legacy_test/test_inference_model_io.py
rename to test/deprecated/legacy_test/test_inference_model_io_deprecated.py
index 2e179cf90276e..c01bd2d92d9d0 100644
--- a/test/deprecated/legacy_test/test_inference_model_io.py
+++ b/test/deprecated/legacy_test/test_inference_model_io_deprecated.py
@@ -29,6 +29,7 @@
     load_inference_model_distributed,
     save_persistables,
 )
+from paddle.pir_utils import test_with_pir_api
 from paddle.static.io import load_inference_model, save_inference_model
 
 paddle.enable_static()
@@ -161,14 +162,15 @@ def test_fit_line_inference_model(self):
 
 
 class TestSaveInferenceModel(unittest.TestCase):
+    @test_with_pir_api
     def test_save_inference_model(self):
         root_path = tempfile.TemporaryDirectory()
         MODEL_DIR = os.path.join(root_path.name, "inference_model2")
-        init_program = Program()
-        program = Program()
+        init_program = paddle.static.Program()
+        program = paddle.static.Program()
 
         # fake program without feed/fetch
-        with program_guard(program, init_program):
+        with paddle.static.program_guard(program, init_program):
             x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
             y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
 
@@ -188,14 +190,15 @@ def test_save_inference_model(self):
         )
         root_path.cleanup()
 
+    @test_with_pir_api
     def test_save_inference_model_with_auc(self):
         root_path = tempfile.TemporaryDirectory()
         MODEL_DIR = os.path.join(root_path.name, "inference_model4")
-        init_program = Program()
-        program = Program()
+        init_program = paddle.static.Program()
+        program = paddle.static.Program()
 
         # fake program without feed/fetch
-        with program_guard(program, init_program):
+        with paddle.static.program_guard(program, init_program):
             x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
             y = paddle.static.data(name='y', shape=[-1, 1], dtype='int32')
             predict = paddle.static.nn.fc(x, size=2, activation='softmax')
@@ -223,14 +226,15 @@ def test_save_inference_model_with_auc(self):
 
 
 class TestInstance(unittest.TestCase):
+    # @test_with_pir_api
     def test_save_inference_model(self):
         root_path = tempfile.TemporaryDirectory()
         MODEL_DIR = os.path.join(root_path.name, "inference_model3")
-        init_program = Program()
-        program = Program()
+        init_program = paddle.static.Program()
+        program = paddle.static.Program()
 
         # fake program without feed/fetch
-        with program_guard(program, init_program):
+        with paddle.static.program_guard(program, init_program):
             x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
             y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
 
@@ -261,14 +265,15 @@ def test_save_inference_model(self):
 
 
 class TestSaveInferenceModelNew(unittest.TestCase):
+    # @test_with_pir_api
     def test_save_and_load_inference_model(self):
         root_path = tempfile.TemporaryDirectory()
         MODEL_DIR = os.path.join(root_path.name, "inference_model5")
-        init_program = base.default_startup_program()
-        program = base.default_main_program()
+        init_program = paddle.static.default_startup_program()
+        program = paddle.static.default_main_program()
 
         # fake program without feed/fetch
-        with program_guard(program, init_program):
+        with paddle.static.program_guard(program, init_program):
             x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
             y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
 
@@ -283,7 +288,7 @@ def test_save_and_load_inference_model(self):
             sgd_optimizer.minimize(avg_cost, init_program)
 
         place = core.CPUPlace()
-        exe = executor.Executor(place)
+        exe = base.Executor(place)
         exe.run(init_program, feed={}, fetch_list=[])
 
         tensor_x = np.array([[1, 1], [1, 2], [5, 2]]).astype("float32")
@@ -344,7 +349,12 @@ def test_save_and_load_inference_model(self):
             exe,
         )
 
-        model_path = MODEL_DIR + "_isdir.pdmodel"
+        if paddle.framework.in_pir_mode():
+            MODEL_SUFFIX = ".json"
+        else:
+            MODEL_SUFFIX = ".pdmodel"
+
+        model_path = MODEL_DIR + "_isdir" + MODEL_SUFFIX
         os.makedirs(model_path)
         self.assertRaises(
             ValueError,
@@ -356,7 +366,7 @@ def test_save_and_load_inference_model(self):
         )
         os.rmdir(model_path)
 
-        params_path = MODEL_DIR + "_isdir.pdmodel"
+        params_path = MODEL_DIR + "_isdir" + MODEL_SUFFIX
         os.makedirs(params_path)
         self.assertRaises(
             ValueError,
@@ -372,7 +382,7 @@ def test_save_and_load_inference_model(self):
             MODEL_DIR, [x, y], [avg_cost], exe
         )
 
-        self.assertTrue(os.path.exists(MODEL_DIR + ".pdmodel"))
+        self.assertTrue(os.path.exists(MODEL_DIR + MODEL_SUFFIX))
         self.assertTrue(os.path.exists(MODEL_DIR + ".pdiparams"))
 
         expected = exe.run(
@@ -405,7 +415,7 @@ def test_save_and_load_inference_model(self):
             unsupported_param=None,
         )
         self.assertRaises(
-            (TypeError, ValueError),
+            (TypeError, RuntimeError, ValueError),
             paddle.static.load_inference_model,
             None,
             exe,
@@ -435,7 +445,7 @@ def test_save_and_load_inference_model(self):
         self.assertRaises(ValueError, paddle.static.io.save_to_file, '', 123)
         # test _get_valid_program
         self.assertRaises(TypeError, paddle.static.io._get_valid_program, 0)
-        p = Program()
+        p = paddle.static.Program()
         cp = CompiledProgram(p)
         paddle.static.io._get_valid_program(cp)
         self.assertTrue(paddle.static.io._get_valid_program(cp) is p)
@@ -491,12 +501,13 @@ def test_serialize_program_and_persistables(self):
             None,
         )
 
+    @test_with_pir_api
     def test_normalize_program(self):
-        init_program = base.default_startup_program()
-        program = base.default_main_program()
+        init_program = paddle.static.default_startup_program()
+        program = paddle.static.default_main_program()
 
         # fake program without feed/fetch
-        with program_guard(program, init_program):
+        with paddle.static.program_guard(program, init_program):
             x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
             y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
 
@@ -525,7 +536,7 @@ def test_normalize_program(self):
 
         # test if return type of serialize_program is bytes
         res = paddle.static.normalize_program(program, [x, y], [avg_cost])
-        self.assertTrue(isinstance(res, Program))
+        self.assertTrue(isinstance(res, paddle.static.Program))
         # test program type
         self.assertRaises(
             TypeError, paddle.static.normalize_program, None, [x, y], [avg_cost]
@@ -544,20 +555,5 @@ def test_normalize_program(self):
         )
 
 
-class TestLoadInferenceModelError(unittest.TestCase):
-    def test_load_model_not_exist(self):
-        place = core.CPUPlace()
-        exe = executor.Executor(place)
-        self.assertRaises(
-            ValueError, load_inference_model, './test_not_exist_dir/model', exe
-        )
-        self.assertRaises(
-            ValueError,
-            load_inference_model_distributed,
-            './test_not_exist_dir',
-            exe,
-        )
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_initializer.py b/test/deprecated/legacy_test/test_initializer.py
index 5910a9c4297e0..c55940afe5903 100644
--- a/test/deprecated/legacy_test/test_initializer.py
+++ b/test/deprecated/legacy_test/test_initializer.py
@@ -20,7 +20,6 @@
 from utils import dygraph_guard, static_guard
 
 import paddle
-from paddle import base
 from paddle.base import framework
 from paddle.base.core import VarDesc
 from paddle.pir_utils import test_with_pir_api
@@ -1448,65 +1447,6 @@ def test_numpy_array_initializer_bf16(self):
         self.assertTrue(check_cast_op_pir(cast_op))
 
 
-class TestSetGlobalInitializer(unittest.TestCase):
-    def test_set_global_weight_initializer(self):
-        """Test Set Global Param initializer with UniformInitializer"""
-        main_prog = framework.Program()
-        startup_prog = framework.Program()
-        base.set_global_initializer(
-            paddle.nn.initializer.Uniform(low=-0.5, high=0.5)
-        )
-        with base.program_guard(main_prog, startup_prog):
-            x = paddle.static.data(name="x", shape=[1, 3, 32, 32])
-            # default initializer of param in layers.conv2d is NormalInitializer
-            conv = paddle.static.nn.conv2d(x, 5, 3)
-
-        block = startup_prog.global_block()
-        self.assertEqual(len(block.ops), 2)
-
-        # init weight is the first op, and bias is the second
-        bias_init_op = block.ops[1]
-        self.assertEqual(bias_init_op.type, 'fill_constant')
-        self.assertAlmostEqual(bias_init_op.attr('value'), 0.0, delta=DELTA)
-
-        param_init_op = block.ops[0]
-        self.assertEqual(param_init_op.type, 'uniform_random')
-        self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA)
-        self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA)
-        self.assertEqual(param_init_op.attr('seed'), 0)
-        base.set_global_initializer(None)
-
-    def test_set_global_bias_initializer(self):
-        """Test Set Global Bias initializer with NormalInitializer"""
-        main_prog = framework.Program()
-        startup_prog = framework.Program()
-        base.set_global_initializer(
-            paddle.nn.initializer.Uniform(low=-0.5, high=0.5),
-            bias_init=paddle.nn.initializer.Normal(0.0, 2.0),
-        )
-        with base.program_guard(main_prog, startup_prog):
-            x = paddle.static.data(name="x", shape=[1, 3, 32, 32])
-            # default initializer of bias in layers.conv2d is ConstantInitializer
-            conv = paddle.static.nn.conv2d(x, 5, 3)
-
-        block = startup_prog.global_block()
-        self.assertEqual(len(block.ops), 2)
-
-        # init weight is the first op, and bias is the second
-        bias_init_op = block.ops[1]
-        self.assertEqual(bias_init_op.type, 'gaussian_random')
-        self.assertAlmostEqual(bias_init_op.attr('mean'), 0.0, delta=DELTA)
-        self.assertAlmostEqual(bias_init_op.attr('std'), 2.0, delta=DELTA)
-        self.assertEqual(bias_init_op.attr('seed'), 0)
-
-        param_init_op = block.ops[0]
-        self.assertEqual(param_init_op.type, 'uniform_random')
-        self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA)
-        self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA)
-        self.assertEqual(param_init_op.attr('seed'), 0)
-        base.set_global_initializer(None)
-
-
 class TestUniformInitializerDygraph(unittest.TestCase):
     def test_uniform_initializer(self, dtype="float32"):
         """
@@ -2192,22 +2132,6 @@ def test_error(self):
             paddle.nn.Conv2D(5, 9, (3, 3), weight_attr=self.weight_attr)
 
 
-class TestKaimingUniform(unittest.TestCase):
-    def func_kaiminguniform_initializer_fan_in_zero(self):
-        paddle.enable_static()
-        x = paddle.static.data(name='x', shape=[1, 0, 0], dtype='float32')
-
-        kaiming = paddle.nn.initializer.KaimingUniform(0)
-        param_attr = paddle.ParamAttr(initializer=kaiming)
-
-        paddle.static.nn.prelu(x, 'all', param_attr=param_attr)
-
-    def test_type_error(self):
-        self.assertRaises(
-            ZeroDivisionError, self.func_kaiminguniform_initializer_fan_in_zero
-        )
-
-
 class TestTruncatedNormalInitializerDygraph(unittest.TestCase):
     def _trunc_normal_numpy(self, tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
         # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
diff --git a/test/deprecated/legacy_test/test_initializer_deprecated.py b/test/deprecated/legacy_test/test_initializer_deprecated.py
new file mode 100644
index 0000000000000..75473cee68b7a
--- /dev/null
+++ b/test/deprecated/legacy_test/test_initializer_deprecated.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle import base
+from paddle.base import framework
+
+DELTA = 0.00001
+
+
+class TestSetGlobalInitializer(unittest.TestCase):
+    def test_set_global_weight_initializer(self):
+        """Test Set Global Param initializer with UniformInitializer"""
+        main_prog = framework.Program()
+        startup_prog = framework.Program()
+        self.addCleanup(base.set_global_initializer, None)  # reset even on failure
+        base.set_global_initializer(
+            paddle.nn.initializer.Uniform(low=-0.5, high=0.5)
+        )
+        with base.program_guard(main_prog, startup_prog):
+            x = paddle.static.data(name="x", shape=[1, 3, 32, 32])
+            # default initializer of param in layers.conv2d is NormalInitializer
+            paddle.static.nn.conv2d(x, 5, 3)
+
+        block = startup_prog.global_block()
+        self.assertEqual(len(block.ops), 2)
+
+        # init weight is the first op, and bias is the second
+        bias_init_op = block.ops[1]
+        self.assertEqual(bias_init_op.type, 'fill_constant')
+        self.assertAlmostEqual(bias_init_op.attr('value'), 0.0, delta=DELTA)
+
+        param_init_op = block.ops[0]
+        self.assertEqual(param_init_op.type, 'uniform_random')
+        self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA)
+        self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA)
+        self.assertEqual(param_init_op.attr('seed'), 0)
+
+    def test_set_global_bias_initializer(self):
+        """Test Set Global Bias initializer with NormalInitializer"""
+        main_prog = framework.Program()
+        startup_prog = framework.Program()
+        self.addCleanup(base.set_global_initializer, None)  # reset even on failure
+        base.set_global_initializer(
+            paddle.nn.initializer.Uniform(low=-0.5, high=0.5),
+            bias_init=paddle.nn.initializer.Normal(0.0, 2.0),
+        )
+        with base.program_guard(main_prog, startup_prog):
+            x = paddle.static.data(name="x", shape=[1, 3, 32, 32])
+            # default initializer of bias in layers.conv2d is ConstantInitializer
+            paddle.static.nn.conv2d(x, 5, 3)
+
+        block = startup_prog.global_block()
+        self.assertEqual(len(block.ops), 2)
+
+        # init weight is the first op, and bias is the second
+        bias_init_op = block.ops[1]
+        self.assertEqual(bias_init_op.type, 'gaussian_random')
+        self.assertAlmostEqual(bias_init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(bias_init_op.attr('std'), 2.0, delta=DELTA)
+        self.assertEqual(bias_init_op.attr('seed'), 0)
+
+        param_init_op = block.ops[0]
+        self.assertEqual(param_init_op.type, 'uniform_random')
+        self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA)
+        self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA)
+        self.assertEqual(param_init_op.attr('seed'), 0)
+
+
+class TestKaimingUniform(unittest.TestCase):
+    def func_kaiminguniform_initializer_fan_in_zero(self):
+        """Build a static prelu on a [1, 0, 0] input with KaimingUniform(0)."""
+        paddle.enable_static()
+        inp = paddle.static.data(name='x', shape=[1, 0, 0], dtype='float32')
+        attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.KaimingUniform(0)
+        )
+        paddle.static.nn.prelu(inp, 'all', param_attr=attr)
+
+    def test_type_error(self):
+        # Building the layer above is expected to raise ZeroDivisionError.
+        with self.assertRaises(ZeroDivisionError):
+            self.func_kaiminguniform_initializer_fan_in_zero()
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_instance_norm_op.py b/test/deprecated/legacy_test/test_instance_norm_op.py
index 2e9f9855d1033..b266e67dfd334 100644
--- a/test/deprecated/legacy_test/test_instance_norm_op.py
+++ b/test/deprecated/legacy_test/test_instance_norm_op.py
@@ -21,7 +21,7 @@
 
 import paddle
 from paddle import base, nn
-from paddle.base import Program, core, program_guard
+from paddle.base import core
 
 
 def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var):
@@ -722,184 +722,6 @@ def init_test_case(self):
         )
 
 
-class TestInstanceNormOpTraining(unittest.TestCase):
-    def setUp(self):
-        self.epsilon = 1e-5
-        self.init_test_case()
-
-    def init_test_case(self):
-        self.shape = [2, 3, 4, 5]
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y',
-            'saved_mean',
-            'saved_variance',
-            'x@GRAD',
-            'scale@GRAD',
-            'bias@GRAD',
-        ]
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        np.testing.assert_allclose(
-            np.array(tensor), np_array, rtol=1e-05, atol=atol, err_msg=msg
-        )
-
-    def set_global_mean_var(self, mean_shape, x):
-        mean, variance = _cal_mean_variance(x, self.epsilon, mean_shape)
-        return mean, variance
-
-    def test_forward_backward(self):
-        def test_with_place(place, shape):
-            paddle.enable_static()
-            epsilon = self.epsilon
-            n, c, h, w = shape[0], shape[1], shape[2], shape[3]
-            scale_shape = [c]
-            mean_shape = [n * c]
-
-            np.random.seed()
-            x = np.random.random_sample(shape).astype(np.float32)
-            scale = np.random.random_sample(scale_shape).astype(np.float32)
-            bias = np.random.random_sample(scale_shape).astype(np.float32)
-            mean, variance = self.set_global_mean_var(mean_shape, x)
-            d_y = np.random.random_sample(shape).astype(np.float32)
-
-            y, saved_mean, variance_tmp = _reference_instance_norm_naive(
-                x, scale, bias, epsilon, mean, variance
-            )
-
-            saved_variance = 1 / np.sqrt(variance_tmp + epsilon)
-
-            d_x, d_scale, d_bias = _reference_instance_norm_grad(
-                x, d_y, scale, saved_mean, saved_variance, epsilon
-            )
-
-            var_dict = locals()
-            var_dict['y@GRAD'] = d_y
-            var_dict['x@GRAD'] = d_x
-            var_dict['scale@GRAD'] = d_scale
-            var_dict['bias@GRAD'] = d_bias
-
-            var_names = [
-                'x',
-                'scale',
-                'bias',
-                'y',
-                'saved_mean',
-                'saved_variance',
-            ]
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = base.Program()
-            with base.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape,
-                    )
-                in_op = block.append_op(
-                    type="instance_norm",
-                    inputs={
-                        "X": block.var("x"),
-                        "Scale": block.var("scale"),
-                        "Bias": block.var("bias"),
-                    },
-                    outputs={
-                        "Y": block.var("y"),
-                        "SavedMean": block.var("saved_mean"),
-                        "SavedVariance": block.var("saved_variance"),
-                    },
-                    attrs={
-                        "epsilon": epsilon,
-                    },
-                )
-
-                block.create_var(name="y@GRAD", dtype='float32', shape=y.shape)
-
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    in_op.desc, self.no_grad_set, []
-                )
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                program._sync_with_cpp()
-
-                exe = base.Executor(place)
-                out = exe.run(
-                    program,
-                    feed={
-                        name: var_dict[name]
-                        for name in ['x', 'scale', 'bias', 'y@GRAD']
-                    },
-                    fetch_list=self.fetch_list,
-                )
-
-            for id, name in enumerate(self.fetch_list):
-                self.__assert_close(var_dict[name], out[id], name)
-            print("op test forward passes: ", str(place))
-            paddle.disable_static()
-
-        places = [core.CPUPlace()]
-
-        if core.is_compiled_with_cuda() and core.op_support_gpu(
-            "instance_norm"
-        ):
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            test_with_place(place, self.shape)
-
-
-class TestInstanceNormOpTrainingCase1(TestInstanceNormOpTraining):
-    def init_test_case(self):
-        self.shape = [2, 3, 4, 5]
-        self.no_grad_set = {'scale@GRAD', 'bias@GRAD'}
-        self.fetch_list = ['y', 'saved_mean', 'saved_variance', 'x@GRAD']
-
-
-class TestInstanceNormOpTrainingCase2(TestInstanceNormOpTraining):
-    def init_test_case(self):
-        self.shape = [20, 50, 4, 5]
-        self.no_grad_set = {'scale@GRAD', 'bias@GRAD'}
-        self.fetch_list = ['y', 'saved_mean', 'saved_variance', 'x@GRAD']
-
-
-class TestInstanceNormOpError(unittest.TestCase):
-    def test_errors(self):
-        paddle.enable_static()
-        with program_guard(Program(), Program()):
-            # the input of instance_norm must be Variable.
-            x1 = base.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace()
-            )
-            self.assertRaises(TypeError, paddle.static.nn.instance_norm, x1)
-
-            # the input dtype of instance_norm must be float32 or float64
-            x2 = paddle.static.data(
-                name='x2', shape=[-1, 3, 4, 5, 6], dtype="int32"
-            )
-            self.assertRaises(TypeError, paddle.static.nn.instance_norm, x2)
-        paddle.disable_static()
-
-
-class TestInstanceNormOpErrorCase1(unittest.TestCase):
-    def test_errors(self):
-        paddle.enable_static()
-        with program_guard(Program(), Program()):
-            # the first dimension of input for instance_norm must between [2d, 5d]
-            x = paddle.static.data(name='x', shape=[3], dtype="float32")
-            self.assertRaises(ValueError, paddle.static.nn.instance_norm, x)
-        paddle.disable_static()
-
-
 class TestElasticNormOp(unittest.TestCase):
     def init_test_case(self):
         self.epsilon = 1e-5
diff --git a/test/deprecated/legacy_test/test_instance_norm_op_deprecated.py b/test/deprecated/legacy_test/test_instance_norm_op_deprecated.py
new file mode 100644
index 0000000000000..cc8e56b8be5e8
--- /dev/null
+++ b/test/deprecated/legacy_test/test_instance_norm_op_deprecated.py
@@ -0,0 +1,271 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import Program, core, program_guard
+
+
+def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var):
+    """NumPy reference forward; mean and var are returned unchanged."""
+    orig_shape = x.shape
+    is_2d = len(orig_shape) == 2
+    if is_2d:
+        # Promote (N, C) input to (N, C, 1, 1) so the 4-D path below applies.
+        x = np.reshape(x, (orig_shape[0], orig_shape[1], 1, 1))
+    n, c, h, w = x.shape
+
+    def _expand(stat, lead_shape, reps):
+        # Reshape a flat vector to lead_shape, then tile it up to x's shape.
+        return np.tile(np.reshape(stat, lead_shape), reps)
+
+    per_instance = ((n, c, 1, 1), (1, 1, h, w))
+    per_channel = ((1, c, 1, 1), (n, 1, h, w))
+    x_norm = (x - _expand(mean, *per_instance)) / np.sqrt(
+        _expand(var, *per_instance) + epsilon
+    )
+    y = _expand(scale, *per_channel) * x_norm + _expand(bias, *per_channel)
+    return (np.reshape(y, orig_shape) if is_2d else y), mean, var
+
+
+def _reference_instance_norm_grad(x, d_y, scale, mean, var, epsilon):
+    # `var` is used as a multiplier below (the test in this file passes 1 / sqrt(variance + epsilon)); `epsilon` is unused.
+    # d_scale = sum(d_y * (x - mean) * var) and d_bias = sum(d_y), both over axes (0, 2, 3)
+    # d_x = scale * var * (d_y - mean(d_y, axis=(2, 3)) - (x - mean) * var * mean(d_y * (x - mean) * var, axis=(2, 3)))
+    n, c, h, w = x.shape
+
+    d_bias = np.sum(d_y, axis=(0, 2, 3))
+
+    mean_tile = np.reshape(mean, (n, c, 1, 1))
+    mean_tile = np.tile(mean_tile, (1, 1, h, w))
+    var_tile = np.reshape(var, (n, c, 1, 1))
+    var_tile = np.tile(var_tile, (1, 1, h, w))
+
+    d_scale = np.sum(d_y * (x - mean_tile) * var_tile, axis=(0, 2, 3))
+    var_inv = var_tile
+    scale_tile = np.reshape(scale, (1, c, 1, 1))
+    scale_tile = np.tile(scale_tile, (n, 1, h, w))
+
+    d_x = (
+        scale_tile
+        * var_inv
+        * (
+            d_y
+            - np.mean(d_y, axis=(2, 3), keepdims=True)
+            - (x - mean_tile)
+            * var_inv
+            * np.mean(
+                d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True
+            )
+        )
+    )
+    return d_x, d_scale, d_bias
+
+
+def _cal_mean_variance(x, epsilon, mean_shape):
+    hw = (2, 3)  # reduce over axes 2 and 3; epsilon is accepted but unused
+    mu, sigma2 = np.mean(x, axis=hw), np.var(x, axis=hw)
+    return np.reshape(mu, mean_shape), np.reshape(sigma2, mean_shape)
+
+
+def instance_norm_wrapper(x, weight=None, bias=None, esp=1e-05):
+    """Forward to paddle.nn.functional.instance_norm with fixed positionals."""
+    positional = (x, None, None, weight, bias, True, 0.9, esp)
+    return paddle.nn.functional.instance_norm(*positional)
+
+
+class TestInstanceNormOpTraining(unittest.TestCase):  # hand-built instance_norm forward + grad op vs. NumPy reference
+    def setUp(self):
+        self.epsilon = 1e-5
+        self.init_test_case()
+
+    def init_test_case(self):  # overridden by the subclasses in this file
+        self.shape = [2, 3, 4, 5]
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y',
+            'saved_mean',
+            'saved_variance',
+            'x@GRAD',
+            'scale@GRAD',
+            'bias@GRAD',
+        ]
+
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        np.testing.assert_allclose(
+            np.array(tensor), np_array, rtol=1e-05, atol=atol, err_msg=msg
+        )
+
+    def set_global_mean_var(self, mean_shape, x):
+        mean, variance = _cal_mean_variance(x, self.epsilon, mean_shape)
+        return mean, variance
+
+    def test_forward_backward(self):
+        def test_with_place(place, shape):
+            paddle.enable_static()
+            epsilon = self.epsilon
+            n, c, h, w = shape[0], shape[1], shape[2], shape[3]
+            scale_shape = [c]
+            mean_shape = [n * c]
+
+            np.random.seed()  # called without a seed value, so inputs are not fixed across runs
+            x = np.random.random_sample(shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
+            mean, variance = self.set_global_mean_var(mean_shape, x)
+            d_y = np.random.random_sample(shape).astype(np.float32)
+
+            y, saved_mean, variance_tmp = _reference_instance_norm_naive(
+                x, scale, bias, epsilon, mean, variance
+            )
+
+            saved_variance = 1 / np.sqrt(variance_tmp + epsilon)  # inverse standard deviation
+
+            d_x, d_scale, d_bias = _reference_instance_norm_grad(
+                x, d_y, scale, saved_mean, saved_variance, epsilon
+            )
+
+            var_dict = locals()  # name -> value lookup; depends on the local variable names above
+            var_dict['y@GRAD'] = d_y
+            var_dict['x@GRAD'] = d_x
+            var_dict['scale@GRAD'] = d_scale
+            var_dict['bias@GRAD'] = d_bias
+
+            var_names = [
+                'x',
+                'scale',
+                'bias',
+                'y',
+                'saved_mean',
+                'saved_variance',
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = base.Program()
+            with base.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape,
+                    )
+                in_op = block.append_op(
+                    type="instance_norm",
+                    inputs={
+                        "X": block.var("x"),
+                        "Scale": block.var("scale"),
+                        "Bias": block.var("bias"),
+                    },
+                    outputs={
+                        "Y": block.var("y"),
+                        "SavedMean": block.var("saved_mean"),
+                        "SavedVariance": block.var("saved_variance"),
+                    },
+                    attrs={
+                        "epsilon": epsilon,
+                    },
+                )
+
+                block.create_var(name="y@GRAD", dtype='float32', shape=y.shape)
+
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    in_op.desc, self.no_grad_set, []
+                )
+                grad_op_desc = grad_op_desc_list[0]  # only the first returned grad op desc is used
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                program._sync_with_cpp()
+
+                exe = base.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in ['x', 'scale', 'bias', 'y@GRAD']
+                    },
+                    fetch_list=self.fetch_list,
+                )
+
+            for id, name in enumerate(self.fetch_list):
+                self.__assert_close(var_dict[name], out[id], name)  # NumPy reference vs. fetched result
+            print("op test forward passes: ", str(place))
+            paddle.disable_static()
+
+        places = [core.CPUPlace()]
+
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+            "instance_norm"
+        ):
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            test_with_place(place, self.shape)
+
+
+class TestInstanceNormOpTrainingCase1(TestInstanceNormOpTraining):
+    def init_test_case(self):
+        # scale@GRAD and bias@GRAD go into no_grad_set; only x@GRAD is fetched.
+        self.shape, self.no_grad_set = [2, 3, 4, 5], {'scale@GRAD', 'bias@GRAD'}
+        self.fetch_list = ['y', 'saved_mean', 'saved_variance'] + ['x@GRAD']
+
+
+class TestInstanceNormOpTrainingCase2(TestInstanceNormOpTraining):
+    def init_test_case(self):
+        # Larger [20, 50, 4, 5] input; same no_grad_set and fetch list as Case1.
+        self.shape, self.no_grad_set = [20, 50, 4, 5], {'scale@GRAD', 'bias@GRAD'}
+        self.fetch_list = ['y', 'saved_mean', 'saved_variance'] + ['x@GRAD']
+
+
+class TestInstanceNormOpError(unittest.TestCase):
+    def test_errors(self):
+        paddle.enable_static()
+        self.addCleanup(paddle.disable_static)  # runs even if an assertion fails
+        with program_guard(Program(), Program()):
+            # the input of instance_norm must be Variable.
+            x1 = base.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace()
+            )
+            self.assertRaises(TypeError, paddle.static.nn.instance_norm, x1)
+
+            # the input dtype of instance_norm must be float32 or float64
+            x2 = paddle.static.data(
+                name='x2', shape=[-1, 3, 4, 5, 6], dtype="int32"
+            )
+            self.assertRaises(TypeError, paddle.static.nn.instance_norm, x2)
+
+
+class TestInstanceNormOpErrorCase1(unittest.TestCase):
+    def test_errors(self):
+        paddle.enable_static()
+        self.addCleanup(paddle.disable_static)  # runs even if the assertion fails
+        with program_guard(Program(), Program()):
+            # a 1-D input is expected to be rejected by instance_norm with ValueError
+            x = paddle.static.data(name='x', shape=[3], dtype="float32")
+            self.assertRaises(ValueError, paddle.static.nn.instance_norm, x)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_inverse_op.py b/test/deprecated/legacy_test/test_inverse_op.py
index 22810eecee07d..54f8466bd4d02 100644
--- a/test/deprecated/legacy_test/test_inverse_op.py
+++ b/test/deprecated/legacy_test/test_inverse_op.py
@@ -35,6 +35,12 @@ def setUp(self):
 
         np.random.seed(123)
         mat = np.random.random(self.matrix_shape).astype(self.dtype)
+        if self.dtype == 'complex64' or self.dtype == 'complex128':
+            mat = (
+                np.random.random(self.matrix_shape)
+                + 1j * np.random.random(self.matrix_shape)
+            ).astype(self.dtype)
+
         inverse = np.linalg.inv(mat)
 
         self.inputs = {'Input': mat}
@@ -92,6 +98,26 @@ def config(self):
         self.python_api = paddle.tensor.math.inverse
 
 
+class TestInverseOpComplex64(TestInverseOp):
+    def config(self):
+        # 10x10 complex64 input, exercised through paddle.tensor.math.inverse
+        self.matrix_shape, self.dtype = [10, 10], "complex64"
+        self.python_api = paddle.tensor.math.inverse
+
+    def test_grad(self):
+        self.check_grad(['Input'], 'Output', check_pir=True)
+
+
+class TestInverseOpComplex128(TestInverseOp):
+    def config(self):
+        # 10x10 complex128 input, exercised through paddle.tensor.math.inverse
+        self.matrix_shape, self.dtype = [10, 10], "complex128"
+        self.python_api = paddle.tensor.math.inverse
+
+    def test_grad(self):
+        self.check_grad(['Input'], 'Output', check_pir=True)
+
+
 class TestInverseAPI(unittest.TestCase):
     def setUp(self):
         np.random.seed(123)
diff --git a/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py b/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py
new file mode 100644
index 0000000000000..a4ab6d9e116c8
--- /dev/null
+++ b/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py
@@ -0,0 +1,387 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import reduce
+from operator import mul
+
+import numpy as np
+from op_test import _set_use_system_allocator
+
+import paddle
+from paddle import base
+from paddle.base import core
+
+paddle.enable_static()
+
+np.random.seed(123)
+paddle.seed(123)
+
+_set_use_system_allocator(True)
+
+
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)  # product of leading axes
+    D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)  # normalized
+    x.shape = [N, D]  # reshapes the caller's array in place (undone below)
+    # NumPy reference forward pass; returns (output, mean, var + epsilon).
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon  # returned var already includes epsilon
+    output = np.divide(
+        (x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1])
+    )
+    if scale is not None:
+        output = scale.reshape([1, D]) * output
+    if beta is not None:
+        output = output + beta.reshape([1, D])
+    # Put the original shape back on x and give the output the same shape.
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+
+
+def _reference_layer_norm_grad(
+    x, grad_y, scale, bias, mean, var, begin_norm_axis=1
+):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
+    # NumPy reference backward pass; returns (grad_x, d_scale, d_bias).
+    if scale is not None:
+        scale_shape = scale.shape
+        scale.shape = [1, D]
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+    # The shapes set in place above are reset before the function returns.
+    # d_bias: column-wise sum of the upstream gradient
+    if bias is not None:
+        d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    else:
+        d_bias = None
+    # d_scale: column-wise sum of normalized x times the upstream gradient
+    if scale is not None:
+        d_scale = np.sum(
+            ((x - mean) * np.sqrt(1 / var)) * grad_y, axis=0
+        ).reshape([1, D])
+    else:
+        d_scale = None
+    # dx: direct term plus the contributions routed through mean and std
+    if scale is not None:
+        dx_end = scale * np.sqrt(1.0 / var) * grad_y
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+            [N, 1]
+        )  # the second part equals zero.
+        d_mean = 1.0 / D * d_mean_0
+        d_std = np.sum(
+            -(1.0 / var) * (x - mean) * grad_y * scale, axis=1
+        ).reshape([N, 1]) * (
+            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)
+        )
+    else:
+        dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape(
+            [N, 1]
+        )  # the second part equals zero.
+        d_mean = 1.0 / D * d_mean_0
+        d_std = np.sum(
+            -(1.0 / var) * (x - mean) * grad_y * 1.0, axis=1
+        ).reshape([N, 1]) * (
+            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)
+        )
+
+    grad_x = dx_end + d_mean + d_std
+
+    grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape
+    var.shape, mean.shape = [N], [N]
+
+    if scale is not None:
+        scale.shape = scale_shape
+    return grad_x, d_scale, d_bias
+
+
+def layer_norm_wrapper(
+    x, scale=None, bias=None, epsilon=1e-05, begin_norm_axis=1
+):
+    # Normalize over every axis from begin_norm_axis through the last one.
+    normalized_shape = list(x.shape)[begin_norm_axis:]
+    return paddle.nn.functional.layer_norm(
+        x, normalized_shape, weight=scale, bias=bias, epsilon=epsilon
+    )
+
+
+class TestLayerNormOp(unittest.TestCase):
+    def setUp(self):
+        self.use_cudnn = True  # also run on CUDAPlace(0) when available
+        paddle.enable_static()
+
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        np.testing.assert_allclose(
+            np.array(tensor).flatten(),
+            np_array.flatten(),
+            rtol=1e-3,
+            atol=atol,
+            err_msg=msg,
+        )
+
+    def check_forward_backward(
+        self,
+        shape,
+        begin_norm_axis,
+        has_scale=True,
+        has_bias=True,
+        y_grad_scale=1.0,
+        use_mkldnn=False,
+    ):
+        def test_with_place(
+            place, shape, begin_norm_axis, use_mkldnn=use_mkldnn
+        ):
+            # attr (epsilon is shared by the NumPy reference and the op)
+            epsilon = 0.00001
+            x_shape = shape
+            D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
+            scale_shape = [D]
+
+            np.random.seed(123)
+            x = np.random.random_sample(x_shape).astype(np.float32)
+            scale = (
+                np.random.random_sample(scale_shape).astype(np.float32)
+                if has_scale
+                else None
+            )
+            bias = (
+                np.random.random_sample(scale_shape).astype(np.float32)
+                if has_bias
+                else None
+            )
+            y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
+                np.float32
+            )
+
+            # NumPy reference forward & backward results used as ground truth
+            y, mean, variance = _reference_layer_norm_naive(
+                x, scale, bias, epsilon, begin_norm_axis
+            )
+            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
+                x, y_grad, scale, bias, mean, variance, begin_norm_axis
+            )
+            # Snapshot locals so arrays can be looked up by their variable name.
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+            var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD']
+            if has_scale:
+                var_names += ['scale']
+            if has_bias:
+                var_names += ['bias']
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = base.Program()
+            with base.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape,
+                    )
+                inputs = {"X": block.var('x')}
+                fetch_list = [
+                    'y',
+                    'mean',
+                    'variance',
+                    'x@GRAD',
+                ]
+                if has_scale:
+                    inputs["Scale"] = block.var('scale')
+                    fetch_list += ['scale@GRAD']
+                if has_bias:
+                    inputs["Bias"] = block.var('bias')
+                    fetch_list += ['bias@GRAD']
+                layer_norm_op = block.append_op(
+                    type="layer_norm",
+                    inputs=inputs,
+                    outputs={
+                        "Y": block.var('y'),
+                        "Mean": block.var('mean'),  # share the same memory
+                        "Variance": block.var(
+                            'variance'
+                        ),  # share the same memory
+                    },
+                    attrs={
+                        "epsilon": epsilon,
+                        "begin_norm_axis": begin_norm_axis,
+                        "use_mkldnn": use_mkldnn,
+                    },
+                )
+                # generate the backward op_desc and append it to the same block
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    layer_norm_op.desc, set(), []
+                )
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                program._sync_with_cpp()
+                exe = base.Executor(place)
+                name_list = ['x', 'y@GRAD']
+                if has_scale:
+                    name_list += ['scale']
+                if has_bias:
+                    name_list += ['bias']
+
+                out = exe.run(
+                    program,
+                    feed={name: var_dict[name] for name in name_list},
+                    fetch_list=fetch_list,
+                )
+                # `out` is ordered like fetch_list: y, mean, variance, x@GRAD,
+                # then scale@GRAD / bias@GRAD when requested (found by name).
+                self.__assert_close(y, out[0], "y")
+                self.__assert_close(mean, out[1], "mean")
+                self.__assert_close(variance, out[2], "variance", 1e-3)
+                self.__assert_close(x_grad, out[3], "x_grad")
+                if has_scale:
+                    self.__assert_close(
+                        scale_grad,
+                        out[fetch_list.index('scale@GRAD')],
+                        "scale_grad",
+                        1e-3,
+                    )
+                if has_bias:
+                    self.__assert_close(
+                        bias_grad,
+                        out[fetch_list.index('bias@GRAD')],
+                        "bias_grad",
+                    )
+
+        places = [core.CPUPlace()]
+        if (
+            core.is_compiled_with_cuda()
+            and core.op_support_gpu("layer_norm")
+            and self.use_cudnn
+        ):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place, shape, begin_norm_axis)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(
+            shape=[2, 3, 4, 5],
+            begin_norm_axis=1,
+            has_scale=False,
+            has_bias=True,
+        )
+        self.check_forward_backward(
+            shape=[2, 3, 4, 5],
+            begin_norm_axis=1,
+            has_scale=True,
+            has_bias=False,
+        )
+        self.check_forward_backward(
+            shape=[2, 3, 4, 5],
+            begin_norm_axis=1,
+            has_scale=False,
+            has_bias=False,
+        )
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
+        self.check_forward_backward(
+            shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1
+        )
+        self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2)
+        self.check_forward_backward(shape=[3, 2, 1133], begin_norm_axis=2)
+        self.check_forward_backward(
+            shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1
+        )
+        self.check_forward_backward(
+            shape=[92, 513, 1134],
+            begin_norm_axis=2,
+            has_scale=False,
+            has_bias=True,
+            y_grad_scale=0.1,
+        )
+        self.check_forward_backward(
+            shape=[92, 513, 1134],
+            begin_norm_axis=2,
+            has_scale=True,
+            has_bias=False,
+            y_grad_scale=0.1,
+        )
+        self.check_forward_backward(
+            shape=[92, 513, 1134],
+            begin_norm_axis=2,
+            has_scale=False,
+            has_bias=False,
+            y_grad_scale=0.1,
+        )
+        self.check_forward_backward(
+            shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True
+        )
+        self.check_forward_backward(
+            shape=[1, 128, 256, 256],
+            begin_norm_axis=3,
+            has_scale=True,
+            has_bias=True,
+        )
+        self.check_forward_backward(
+            shape=[1, 256, 384],
+            begin_norm_axis=2,
+            has_scale=True,
+            has_bias=True,
+        )
+
+
+class TestLayerNormAPI(unittest.TestCase):
+    def test_case(self):
+        # Chains three static layer_norm layers on one input: both affine
+        # terms on, both off, and both on with string-named parameter attrs.
+        common = {'begin_norm_axis': 1, 'epsilon': 1e-05}
+        x = paddle.static.data(name='x', shape=[64, 32, 256], dtype='float32')
+        x = paddle.static.nn.layer_norm(
+            x,
+            scale=True,
+            shift=True,
+            param_attr=None,
+            bias_attr=None,
+            **common,
+        )
+        x = paddle.static.nn.layer_norm(
+            x,
+            scale=False,
+            shift=False,
+            param_attr=None,
+            bias_attr=None,
+            **common,
+        )
+        x = paddle.static.nn.layer_norm(
+            x,
+            scale=True,
+            shift=True,
+            param_attr="scale",
+            bias_attr="shift",
+            **common,
+        )
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_math_op_patch.py b/test/deprecated/legacy_test/test_math_op_patch.py
index fe0708098fb72..49331d3fc0955 100644
--- a/test/deprecated/legacy_test/test_math_op_patch.py
+++ b/test/deprecated/legacy_test/test_math_op_patch.py
@@ -19,8 +19,6 @@
 
 import paddle
 from paddle import base
-from paddle.framework import in_pir_mode
-from paddle.pir_utils import test_with_pir_api
 
 
 class TestMathOpPatches(unittest.TestCase):
@@ -232,32 +230,6 @@ def test_equal(self):
         np.testing.assert_array_equal(c_np, a_np == b_np)
         self.assertEqual(c.dtype, paddle.bool)
 
-    @prog_scope()
-    @test_with_pir_api
-    def test_equal_and_cond(self):
-        a = paddle.static.data(name="a", shape=[-1, 1], dtype='float32')
-        b = paddle.static.data(name="b", shape=[-1, 1], dtype='float32')
-        if not in_pir_mode():
-            a.desc.set_need_check_feed(False)
-            b.desc.set_need_check_feed(False)
-        one = paddle.ones(shape=[1], dtype='int32')
-        zero = paddle.zeros(shape=[1], dtype='int32')
-        cond = one == zero
-        c = paddle.static.nn.cond(cond, lambda: a + b, lambda: a - b)
-
-        place = base.CPUPlace()
-        exe = base.Executor(place)
-        a_np = np.array([3, 4, 10, 14, 9, 18]).astype('float32')
-        b_np = np.array([3, 4, 11, 15, 8, 18]).astype('float32')
-
-        (c_np,) = exe.run(
-            paddle.static.default_main_program(),
-            feed={"a": a_np, "b": b_np},
-            fetch_list=[c],
-        )
-
-        np.testing.assert_array_equal(c_np, a_np - b_np)
-
     @prog_scope()
     def test_neg(self):
         a = paddle.static.data(name="a", shape=[-1, 10, 1], dtype='float32')
diff --git a/test/deprecated/legacy_test/test_math_op_patch_deprecated.py b/test/deprecated/legacy_test/test_math_op_patch_deprecated.py
new file mode 100644
index 0000000000000..982c439d99828
--- /dev/null
+++ b/test/deprecated/legacy_test/test_math_op_patch_deprecated.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from decorator_helper import prog_scope
+
+import paddle
+from paddle import base
+from paddle.framework import in_pir_mode
+from paddle.pir_utils import test_with_pir_api
+
+
+class TestMathOpPatches(unittest.TestCase):
+    # Per-test fixture (instance method): fixed NumPy seed, static-graph mode.
+    def setUp(self):
+        np.random.seed(1024)
+        paddle.enable_static()
+
+    @prog_scope()
+    @test_with_pir_api
+    def test_equal_and_cond(self):
+        a = paddle.static.data(name="a", shape=[-1, 1], dtype='float32')
+        b = paddle.static.data(name="b", shape=[-1, 1], dtype='float32')
+        if not in_pir_mode():
+            a.desc.set_need_check_feed(False)
+            b.desc.set_need_check_feed(False)
+        one = paddle.ones(shape=[1], dtype='int32')
+        zero = paddle.zeros(shape=[1], dtype='int32')
+        cond = one == zero  # always false, so the `a - b` branch is taken
+        c = paddle.static.nn.cond(cond, lambda: a + b, lambda: a - b)
+
+        place = base.CPUPlace()
+        exe = base.Executor(place)
+        a_np = np.array([3, 4, 10, 14, 9, 18]).astype('float32')
+        b_np = np.array([3, 4, 11, 15, 8, 18]).astype('float32')
+
+        (c_np,) = exe.run(
+            paddle.static.default_main_program(),
+            feed={"a": a_np, "b": b_np},
+            fetch_list=[c],
+        )
+
+        np.testing.assert_array_equal(c_np, a_np - b_np)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_merged_momentum_op.py b/test/deprecated/legacy_test/test_merged_momentum_op_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_merged_momentum_op.py
rename to test/deprecated/legacy_test/test_merged_momentum_op_deprecated.py
diff --git a/test/deprecated/legacy_test/test_momentum_op.py b/test/deprecated/legacy_test/test_momentum_op.py
index de8ba1886598e..c48601326f4bd 100644
--- a/test/deprecated/legacy_test/test_momentum_op.py
+++ b/test/deprecated/legacy_test/test_momentum_op.py
@@ -14,7 +14,6 @@
 
 import unittest
 
-import numpy
 import numpy as np
 from op import Operator
 from op_test import OpTest
@@ -1035,80 +1034,6 @@ def test_main(self):
                 self._check_with_param_group(place, use_amp)
 
 
-class TestMultiTensorMomentumStatic(unittest.TestCase):
-    def _momentum_optimize_static(
-        self, place, use_amp=False, use_multi_tensor=False
-    ):
-        paddle.enable_static()
-        paddle.seed(10)
-        np.random.seed(10)
-        if place == 'cpu':
-            use_amp = False
-        exe = paddle.static.Executor(place=place)
-        train_program = paddle.static.Program()
-        startup_program = paddle.static.Program()
-        optimizer = paddle.optimizer.Momentum(
-            multi_precision=use_amp, use_multi_tensor=use_multi_tensor
-        )
-        if use_amp:
-            optimizer = paddle.static.amp.decorate(
-                optimizer,
-                init_loss_scaling=128.0,
-                use_dynamic_loss_scaling=True,
-                use_pure_fp16=True,
-                use_fp16_guard=False,
-            )
-        with paddle.static.program_guard(train_program, startup_program):
-            if use_amp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16'
-                )
-            else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32'
-                )
-            hidden = paddle.static.nn.fc(x=data, size=10)
-            loss = paddle.mean(hidden)
-            optimizer.minimize(loss)
-        exe.run(startup_program)
-        if use_amp:
-            optimizer.amp_init(
-                place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
-            )
-            x = numpy.random.random(size=(2, 2)).astype('float16')
-        else:
-            x = numpy.random.random(size=(2, 2)).astype('float32')
-        out = []
-        for idx in range(5):
-            (loss_data,) = exe.run(
-                train_program, feed={"X": x}, fetch_list=[loss]
-            )
-            out.append(loss_data)
-        return out
-
-    def _get_places(self):
-        places = ['cpu']
-        if paddle.is_compiled_with_cuda():
-            places.append('gpu')
-        return places
-
-    def _check_with_place_amp(self, place, use_amp):
-        output1 = self._momentum_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=True
-        )
-        output2 = self._momentum_optimize_static(
-            place=place, use_amp=use_amp, use_multi_tensor=False
-        )
-        for idx in range(len(output1)):
-            np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05)
-
-    def test_main(self):
-        for place in self._get_places():
-            use_amp_list = [True, False]
-            for use_amp in use_amp_list:
-                self._check_with_place_amp(place, use_amp)
-
-
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_momentum_op_deprecated.py b/test/deprecated/legacy_test/test_momentum_op_deprecated.py
new file mode 100644
index 0000000000000..65c5e584d0c5f
--- /dev/null
+++ b/test/deprecated/legacy_test/test_momentum_op_deprecated.py
@@ -0,0 +1,157 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy
+import numpy as np
+
+import paddle
+
+
+def calculate_momentum_by_numpy(
+    param,
+    grad,
+    mu,
+    velocity,
+    use_nesterov,
+    learning_rate,
+    regularization_method=None,
+    regularization_coeff=1.0,
+):
+    # NumPy reference for one momentum step; returns (param_out, velocity_out).
+    with_l2_decay = regularization_method == "l2_decay"
+    if with_l2_decay:
+        # L2 decay is folded into the gradient before the velocity update.
+        grad = grad + regularization_coeff * param
+    velocity_out = mu * velocity + grad
+
+    if not use_nesterov:
+        param_out = param - learning_rate * velocity_out
+    elif with_l2_decay:
+        param_out = param - (grad + velocity_out * mu) * learning_rate
+    else:
+        # Same update as above, written with the learning rate distributed.
+        param_out = (
+            param - grad * learning_rate - velocity_out * mu * learning_rate
+        )
+
+    return param_out, velocity_out
+
+
+def momentum_wrapper(
+    param,
+    grad,
+    velocity,
+    learning_rate=1.0,
+    master_param=None,
+    mu=0.0,
+    use_nesterov=False,
+    regularization_method="",
+    regularization_coeff=0.0,
+    multi_precision=False,
+    rescale_grad=1.0,
+):
+    return paddle._C_ops.momentum_(  # arguments forwarded in signature order
+        param,
+        grad,
+        velocity,
+        learning_rate,
+        master_param,
+        mu,
+        use_nesterov,
+        regularization_method,
+        regularization_coeff,
+        multi_precision,
+        rescale_grad,
+    )
+
+
+class TestMultiTensorMomentumStatic(unittest.TestCase):
+    def _momentum_optimize_static(
+        self, place, use_amp=False, use_multi_tensor=False
+    ):
+        # Trains a single fc layer for five steps with Momentum and returns
+        # the loss fetched at every step; AMP is forced off on 'cpu'.
+        paddle.enable_static()
+        paddle.seed(10)
+        np.random.seed(10)
+        if place == 'cpu':
+            use_amp = False
+        feed_dtype = 'float16' if use_amp else 'float32'
+
+        exe = paddle.static.Executor(place=place)
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        optimizer = paddle.optimizer.Momentum(
+            multi_precision=use_amp, use_multi_tensor=use_multi_tensor
+        )
+        if use_amp:
+            optimizer = paddle.static.amp.decorate(
+                optimizer,
+                init_loss_scaling=128.0,
+                use_dynamic_loss_scaling=True,
+                use_pure_fp16=True,
+                use_fp16_guard=False,
+            )
+
+        # fc followed by mean gives the loss that the optimizer minimizes.
+        with paddle.static.program_guard(main_prog, startup_prog):
+            data = paddle.static.data(shape=[2, 2], name='X', dtype=feed_dtype)
+            hidden = paddle.static.nn.fc(x=data, size=10)
+            loss = paddle.mean(hidden)
+            optimizer.minimize(loss)
+
+        exe.run(startup_prog)
+        if use_amp:
+            optimizer.amp_init(
+                place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
+            )
+        # NumPy is re-seeded on every call, so each run draws the same batch.
+        x = numpy.random.random(size=(2, 2)).astype(feed_dtype)
+
+        losses = []
+        for _ in range(5):
+            (step_loss,) = exe.run(
+                main_prog, feed={"X": x}, fetch_list=[loss]
+            )
+            losses.append(step_loss)
+        return losses
+
+    def _get_places(self):
+        # CPU is always exercised; GPU only in CUDA builds.
+        if paddle.is_compiled_with_cuda():
+            return ['cpu', 'gpu']
+        return ['cpu']
+
+    def _check_with_place_amp(self, place, use_amp):
+        fused, plain = (
+            self._momentum_optimize_static(
+                place=place, use_amp=use_amp, use_multi_tensor=flag
+            )
+            for flag in (True, False)
+        )
+        for got, want in zip(fused, plain):
+            np.testing.assert_allclose(got, want, rtol=1e-05)
+
+    def test_main(self):
+        # Every place is checked with AMP requested and with AMP off.
+        for place in self._get_places():
+            for use_amp in (True, False):
+                self._check_with_place_amp(place, use_amp)
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_multinomial_op.py b/test/deprecated/legacy_test/test_multinomial_op.py
index f6fc6e281193b..48c00ed5506e5 100644
--- a/test/deprecated/legacy_test/test_multinomial_op.py
+++ b/test/deprecated/legacy_test/test_multinomial_op.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 from op_test import OpTest, convert_float_to_uint16
-from test_attribute_var import UnittestBase
+from test_attribute_var_deprecated import UnittestBase
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_name_scope.py b/test/deprecated/legacy_test/test_name_scope_deprecated.py
similarity index 98%
rename from test/deprecated/legacy_test/test_name_scope.py
rename to test/deprecated/legacy_test/test_name_scope_deprecated.py
index 4b3e5dd0ff9df..e0822313ef27a 100644
--- a/test/deprecated/legacy_test/test_name_scope.py
+++ b/test/deprecated/legacy_test/test_name_scope_deprecated.py
@@ -17,6 +17,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestNameScope(unittest.TestCase):
     def test_name_scope(self):
diff --git a/test/deprecated/legacy_test/test_nce.py b/test/deprecated/legacy_test/test_nce_deprecated.py
similarity index 79%
rename from test/deprecated/legacy_test/test_nce.py
rename to test/deprecated/legacy_test/test_nce_deprecated.py
index 1091f706d1935..fbfea5a4359cd 100644
--- a/test/deprecated/legacy_test/test_nce.py
+++ b/test/deprecated/legacy_test/test_nce_deprecated.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest, paddle_static_guard
+from op_test import paddle_static_guard
 
 import paddle
 from paddle import base
@@ -66,94 +66,6 @@ def nce(
     )
 
 
-class TestNCE(OpTest):
-    def generate_data(
-        self,
-        dim,
-        batch_size,
-        num_classes,
-        num_true_class,
-        num_neg_samples,
-        is_sparse,
-    ):
-        input = np.random.randn(batch_size, dim).astype(np.float32)
-        weight = np.random.randn(num_classes, dim).astype(np.float32)
-        bias = np.random.randn(num_classes).astype(np.float32)
-        sample_weight = np.random.randn(batch_size).astype(np.float32)
-        labels = np.random.randint(
-            0, num_classes, (batch_size, num_true_class)
-        ).astype("int64")
-        self.attrs = {
-            'num_total_classes': num_classes,
-            'num_neg_samples': num_neg_samples,
-            'custom_neg_classes': list(range(num_neg_samples)),
-            'seed': 0,
-            'sampler': 0,
-            'is_sparse': is_sparse,
-            'is_test': self.is_test,
-        }
-        self.inputs = {
-            'Input': input,
-            'Label': labels,
-            'Weight': weight,
-            'Bias': bias,
-            'SampleWeight': sample_weight,
-        }
-
-    def set_is_test(self):
-        self.is_test = False
-
-    def set_data(self):
-        self.generate_data(5, 25, 100, 1, 2, False)
-
-    def compute(self):
-        out = nce(
-            self.inputs['Input'],
-            self.inputs['Weight'],
-            self.inputs['Bias'],
-            self.inputs['SampleWeight'],
-            self.inputs['Label'],
-            self.attrs['num_total_classes'],
-            self.attrs['num_neg_samples'],
-        )
-        if self.is_test:
-            self.outputs = {'Cost': out[0]}
-        else:
-            self.outputs = {
-                'Cost': out[0],
-                'SampleLogits': out[1],
-                'SampleLabels': out[2],
-            }
-
-    def setUp(self):
-        self.op_type = 'nce'
-        self.set_is_test()
-        self.set_data()
-        self.compute()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02
-        )
-
-
-class TestNCECase1Tensor(TestNCE):
-    def set_data(self):
-        self.generate_data(10, 20, 100, 2, 5, False)
-
-
-class TestNCETensorIsTest(TestNCE):
-    # if is_test = True, there's no need to calculate grad
-    def set_is_test(self):
-        self.is_test = True
-
-    def test_check_grad(self):
-        pass
-
-
 class TestNCECase1SelectedRows(unittest.TestCase):
     def setUp(self):
         self.base_lr = 0.0001
diff --git a/test/deprecated/legacy_test/test_one_hot_v2_op.py b/test/deprecated/legacy_test/test_one_hot_v2_op.py
index 760d96858c3fa..b19bb8b6d2fb8 100644
--- a/test/deprecated/legacy_test/test_one_hot_v2_op.py
+++ b/test/deprecated/legacy_test/test_one_hot_v2_op.py
@@ -54,6 +54,37 @@ def test_check_output(self):
         self.check_output(check_cinn=True, check_prim_pir=True)
 
 
+class TestOneHotOp_dims(OpTest):
+    # one_hot_v2 on a 4-D int32 index tensor, with depth fed as a tensor.
+    # The expected output is built with NumPy and has a trailing depth axis.
+    def setUp(self):
+        self.op_type = 'one_hot_v2'
+        self.prim_op_type = "comp"
+        self.python_api = one_hot_wrapper
+        self.public_python_api = one_hot_wrapper
+        self.python_out_sig = ['Out']
+
+        depth = 10
+        # Built from `depth` so the tensor input cannot drift from the
+        # value used for the expected output.
+        depth_np = np.array(depth).astype('int32')
+        x_shape = [5, 10, 7, 3]
+        # randint's upper bound is exclusive: ids fall in [0, depth - 2].
+        x = np.random.randint(0, depth - 1, size=x_shape).astype('int32')
+
+        # Row k of the identity matrix is the one-hot code of class k, so
+        # indexing it with `x` yields an array of shape x_shape + [depth].
+        out = np.eye(depth, dtype='float32')[x]
+
+        # depth travels through the 'depth_tensor' input, not through attrs.
+        self.inputs = {'X': x, 'depth_tensor': depth_np}
+        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output(check_cinn=True, check_prim_pir=True)
+
+
 class TestOneHotOp_attr(OpTest):
     def setUp(self):
         self.op_type = 'one_hot_v2'
diff --git a/test/deprecated/legacy_test/test_optimizer.py b/test/deprecated/legacy_test/test_optimizer_deprecated.py
similarity index 88%
rename from test/deprecated/legacy_test/test_optimizer.py
rename to test/deprecated/legacy_test/test_optimizer_deprecated.py
index c7e6d21124176..f87f348d456ae 100644
--- a/test/deprecated/legacy_test/test_optimizer.py
+++ b/test/deprecated/legacy_test/test_optimizer_deprecated.py
@@ -12,11 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import tempfile
 import unittest
 
-import numpy
 import numpy as np
 
 import paddle
@@ -25,10 +22,10 @@
 from paddle.base.backward import append_backward
 from paddle.base.framework import (
     Program,
-    convert_np_dtype_to_dtype_,
     program_guard,
 )
-from paddle.io import Dataset
+
+paddle.enable_static()
 
 
 class TestOptimizer(unittest.TestCase):
@@ -1012,142 +1009,6 @@ def test_program_desc(
         )
 
 
-class TestOptimizerDtype(unittest.TestCase):
-    '''
-    The dtype of optimizer should be inferred by parameters, and the learning rate
-    is cteated with the same dtype.
-    '''
-
-    def check_with_dtype(self, dtype):
-        class MyLayer(paddle.nn.Layer):
-            def __init__(self, dtype):
-                super().__init__()
-                self._w = self.create_parameter([2, 3], dtype=dtype)
-                self._b = self.create_parameter([2, 3], dtype=dtype)
-
-            def forward(self, x):
-                return x * self._w + self._b
-
-        with paddle.base.dygraph.guard():
-            model = MyLayer(dtype)
-            x = paddle.rand([10, 2, 3], dtype=dtype)
-            loss = model(x)
-            adam = paddle.optimizer.Adam(parameters=model.parameters())
-            loss.backward()
-            adam.step()
-            self.assertEqual(adam._dtype, convert_np_dtype_to_dtype_(dtype))
-
-    def test_float64(self):
-        self.check_with_dtype('float64')
-
-    def test_float32(self):
-        self.check_with_dtype('float32')
-
-
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or paddle.device.cuda.get_device_capability()[0] < 7.0,
-    "run test when gpu's compute capability is at least 7.0.",
-)
-class TestMasterWeightSaveForFP16(unittest.TestCase):
-    '''
-    For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to improve the accuracy.
-    Master weights will be saved by optimizer::state_dict.
-    '''
-
-    def setUp(self):
-        self.temp_dir = tempfile.TemporaryDirectory()
-
-    def tearDown(self):
-        self.temp_dir.cleanup()
-
-    def check_with_opt_state_dict(self, use_save_load=True):
-        paddle.seed(100)
-        numpy.random.seed(100)
-
-        class SimpleNet(paddle.nn.Layer):
-            def __init__(self, input_size, output_size):
-                super().__init__()
-                self.linears = paddle.nn.LayerList(
-                    [
-                        paddle.nn.Linear(input_size, output_size)
-                        for i in range(1)
-                    ]
-                )
-
-            def forward(self, x):
-                for i, l in enumerate(self.linears):
-                    x = self.linears[i](x)
-                return x
-
-        input_size = 2  # 设为较大的值
-        output_size = 2  # 设为较大的值
-        batch_size = 2  # batch_size 为8的倍数
-        nums_batch = 10
-
-        class RandomDataset(Dataset):
-            def __init__(self, num_samples):
-                self.num_samples = num_samples
-
-            def __getitem__(self, idx):
-                data = numpy.random.random([input_size]).astype('float16')
-                label = numpy.random.random([output_size]).astype('float16')
-                return data, label
-
-            def __len__(self):
-                return self.num_samples
-
-        dataset = RandomDataset(nums_batch * batch_size)
-        loader = paddle.io.DataLoader(
-            dataset,
-            batch_size=batch_size,
-            shuffle=False,
-            drop_last=True,
-            num_workers=0,
-        )
-
-        mse = paddle.nn.MSELoss()
-        model = SimpleNet(input_size, output_size)  # 定义模型
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.0001,
-            parameters=model.parameters(),
-            multi_precision=True,
-        )  # 定义优化器
-        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-        model = paddle.amp.decorate(models=model, level='O2')
-
-        for i, (data, label) in enumerate(loader):
-            with paddle.amp.auto_cast(level='O2'):
-                output = model(data)
-                loss = mse(output, label)
-            scaled = scaler.scale(loss)
-            scaled.backward()
-            scaler.step(optimizer)
-            scaler.update()
-            optimizer.clear_grad(set_to_zero=False)
-
-            if use_save_load and i == 5:
-                model_path = os.path.join(self.temp_dir.name, "model.pdparams")
-                optimizer_path = os.path.join(self.temp_dir.name, "opt.pdopt")
-                paddle.save(model.state_dict(), model_path)
-                paddle.save(optimizer.state_dict(), optimizer_path)
-                model.set_state_dict(paddle.load(model_path))
-                optimizer.set_state_dict(paddle.load(optimizer_path))
-
-        return loss.numpy()
-
-    def test_with_state_dict(self):
-        if core.is_compiled_with_cuda():
-            with base.dygraph.guard():
-                out_use_state_dict = self.check_with_opt_state_dict(
-                    use_save_load=True
-                )
-                out_no_state_dict = self.check_with_opt_state_dict(
-                    use_save_load=False
-                )
-            np.testing.assert_array_equal(out_use_state_dict, out_no_state_dict)
-
-
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_optimizer_in_control_flow.py b/test/deprecated/legacy_test/test_optimizer_in_control_flow_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_optimizer_in_control_flow.py
rename to test/deprecated/legacy_test/test_optimizer_in_control_flow_deprecated.py
diff --git a/test/deprecated/legacy_test/test_prelu_op_deprecated.py b/test/deprecated/legacy_test/test_prelu_op_deprecated.py
new file mode 100644
index 0000000000000..f329a58ecd15f
--- /dev/null
+++ b/test/deprecated/legacy_test/test_prelu_op_deprecated.py
@@ -0,0 +1,86 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import Program, core
+
+paddle.enable_static()
+
+
+def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'):
+    helper = base.layer_helper.LayerHelper('prelu', **locals())
+    alpha_shape = [1, x.shape[1], 1, 1]
+    dtype = helper.input_dtype(input_param_name='x')
+    alpha = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=alpha_shape,
+        dtype='float32',
+        is_bias=False,
+        default_initializer=paddle.nn.initializer.Constant(0.25),
+    )
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="prelu",
+        inputs={"X": x, 'Alpha': alpha},
+        attrs={"mode": mode, 'data_format': data_format},
+        outputs={"Out": out},
+    )
+    return out
+
+
+# error message test if mode is not one of 'all', 'channel', 'element'
+class TestModeError(unittest.TestCase):
+    def setUp(self):
+        self.place = (
+            paddle.CUDAPlace(0)
+            if core.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
+        self.x_np = np.ones([1, 2, 3, 4]).astype('float32')
+
+    def test_mode_error(self):
+        main_program = Program()
+        with base.program_guard(main_program, Program()):
+            x = paddle.static.data(name='x', shape=[2, 3, 4, 5])
+            try:
+                y = prelu_t(x, 'any')
+            except Exception as e:
+                assert e.args[0].find('InvalidArgument') != -1
+
+    def test_data_format_error1(self):
+        main_program = Program()
+        with base.program_guard(main_program, Program()):
+            x = paddle.static.data(name='x', shape=[2, 3, 4, 5])
+            try:
+                y = prelu_t(x, 'channel', data_format='N')
+            except Exception as e:
+                assert e.args[0].find('InvalidArgument') != -1
+
+    def test_data_format_error2(self):
+        main_program = Program()
+        with base.program_guard(main_program, Program()):
+            x = paddle.static.data(name='x', shape=[2, 3, 4, 5])
+            try:
+                y = paddle.static.nn.prelu(x, 'channel', data_format='N')
+            except ValueError as e:
+                pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_program_code.py b/test/deprecated/legacy_test/test_program_code_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_program_code.py
rename to test/deprecated/legacy_test/test_program_code_deprecated.py
diff --git a/test/deprecated/legacy_test/test_program.py b/test/deprecated/legacy_test/test_program_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_program.py
rename to test/deprecated/legacy_test/test_program_deprecated.py
diff --git a/test/deprecated/legacy_test/test_program_prune_backward.py b/test/deprecated/legacy_test/test_program_prune_backward_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_program_prune_backward.py
rename to test/deprecated/legacy_test/test_program_prune_backward_deprecated.py
index 8bf154f934b96..dcd514b471415 100755
--- a/test/deprecated/legacy_test/test_program_prune_backward.py
+++ b/test/deprecated/legacy_test/test_program_prune_backward_deprecated.py
@@ -26,6 +26,8 @@
 from paddle.base import core
 from paddle.dataset import wmt16
 
+paddle.enable_static()
+
 DeviceType = core.DeviceType
 
 
diff --git a/test/deprecated/legacy_test/test_program_to_string.py b/test/deprecated/legacy_test/test_program_to_string_deprecated.py
similarity index 98%
rename from test/deprecated/legacy_test/test_program_to_string.py
rename to test/deprecated/legacy_test/test_program_to_string_deprecated.py
index c6524d9cf5d92..52768d4600785 100644
--- a/test/deprecated/legacy_test/test_program_to_string.py
+++ b/test/deprecated/legacy_test/test_program_to_string_deprecated.py
@@ -17,6 +17,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 
 class TestProgram(unittest.TestCase):
     def test_program_to_string(self):
diff --git a/test/deprecated/legacy_test/test_prune.py b/test/deprecated/legacy_test/test_prune_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_prune.py
rename to test/deprecated/legacy_test/test_prune_deprecated.py
index f82a4d4331b09..47f0d3d749701 100644
--- a/test/deprecated/legacy_test/test_prune.py
+++ b/test/deprecated/legacy_test/test_prune_deprecated.py
@@ -21,6 +21,8 @@
 from paddle import base
 from paddle.base import framework
 
+paddle.enable_static()
+
 
 class TestPruneBase(unittest.TestCase):
     def run_net(self, net):
diff --git a/test/deprecated/legacy_test/test_py_func_op.py b/test/deprecated/legacy_test/test_py_func_op_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_py_func_op.py
rename to test/deprecated/legacy_test/test_py_func_op_deprecated.py
index 3fa249935406f..89ad64aa7d4ab 100644
--- a/test/deprecated/legacy_test/test_py_func_op.py
+++ b/test/deprecated/legacy_test/test_py_func_op_deprecated.py
@@ -20,6 +20,8 @@
 import paddle
 from paddle import base
 
+paddle.enable_static()
+
 dev_cnt = 2
 if base.core.is_compiled_with_cuda():
     dev_cnt = base.core.get_cuda_device_count()
diff --git a/test/deprecated/legacy_test/test_random_seed_deprecated.py b/test/deprecated/legacy_test/test_random_seed_deprecated.py
new file mode 100644
index 0000000000000..ee1dd64b81ee3
--- /dev/null
+++ b/test/deprecated/legacy_test/test_random_seed_deprecated.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test that re-seeding the generator reproduces static-graph results."""
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+
+
+class TestGeneratorSeed(unittest.TestCase):
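+    # Test case for the generator seed: re-seeding with the same value
+    # must reproduce the TruncatedNormal-initialized fc outputs in static
+    # graph mode (the assertions only run on builds without CUDA).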
+    def test_gen_TruncatedNormal_initializer(self):
+        base.disable_dygraph()
+
+        gen = paddle.seed(123123143)
+        cur_state = gen.get_state()
+
+        startup_program = base.Program()
+        train_program = base.Program()
+        with base.program_guard(train_program, startup_program):
+            # example 1:
+            # attr shape is a list which doesn't contain tensor Variable.
+            x = paddle.uniform(shape=[2, 10])
+            result_1 = paddle.static.nn.fc(
+                x,
+                size=10,
+                weight_attr=paddle.nn.initializer.TruncatedNormal(
+                    mean=0.0, std=2.0
+                ),
+            )
+            result_2 = paddle.static.nn.fc(
+                x,
+                size=10,
+                weight_attr=paddle.nn.initializer.TruncatedNormal(
+                    mean=0.0, std=2.0
+                ),
+            )
+
+            exe = base.Executor(base.CPUPlace())
+            exe.run(startup_program)
+            out1 = exe.run(
+                train_program, feed={}, fetch_list=[result_1, result_2]
+            )
+
+        gen.manual_seed(123123143)
+        with base.program_guard(train_program, startup_program):
+            exe.run(startup_program)
+            out2 = exe.run(
+                train_program, feed={}, fetch_list=[result_1, result_2]
+            )
+
+        out1_res1 = np.array(out1[0])
+        out1_res2 = np.array(out1[1])
+        out2_res1 = np.array(out2[0])
+        out2_res2 = np.array(out2[1])
+
+        if not core.is_compiled_with_cuda():
+            print(">>>>>>> sampling id static >>>>>>>")
+            np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
+            np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
+            self.assertTrue(not np.allclose(out1_res2, out1_res1))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_regularizer_api_deprecated.py b/test/deprecated/legacy_test/test_regularizer_api_deprecated.py
new file mode 100644
index 0000000000000..f42e07d3ae0cc
--- /dev/null
+++ b/test/deprecated/legacy_test/test_regularizer_api_deprecated.py
@@ -0,0 +1,180 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import random
+import unittest
+from functools import partial
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+
+
+def bow_net(
+    data,
+    label,
+    dict_dim,
+    is_sparse=False,
+    emb_dim=8,
+    hid_dim=8,
+    hid_dim2=6,
+    class_dim=2,
+):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    base/PaddleNLP/text_classification/nets.py
+    """
+    emb = paddle.static.nn.embedding(
+        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
+    )
+    bow = paddle.static.nn.sequence_lod.sequence_pool(
+        input=emb, pool_type='sum'
+    )
+    bow_tanh = paddle.tanh(bow)
+    fc_1 = paddle.static.nn.fc(x=bow_tanh, size=hid_dim, activation="tanh")
+    fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim2, activation="tanh")
+    prediction = paddle.static.nn.fc(
+        x=[fc_2], size=class_dim, activation="softmax"
+    )
+    cost = paddle.nn.functional.cross_entropy(
+        input=prediction, label=label, reduction='none', use_softmax=False
+    )
+    avg_cost = paddle.mean(x=cost)
+
+    return avg_cost
+
+
+class TestRegularizer(unittest.TestCase):
+    def setUp(self):
+        self.word_len = 1500
+        self.train_data = [
+            [(random.sample(range(1000), 10), [0])] for _ in range(2)
+        ]
+
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self, main_prog, startup_prog):
+        scope = base.core.Scope()
+        with base.unique_name.guard():
+            with base.scope_guard(scope):
+                with base.program_guard(main_prog, startup_prog):
+                    yield
+
+    def run_program(self, place, feed_list):
+        exe = base.Executor(place)
+        feeder = base.DataFeeder(feed_list=feed_list, place=place)
+        exe.run(base.default_startup_program())
+
+        main_prog = base.default_main_program()
+        param_list = [var.name for var in main_prog.block(0).all_parameters()]
+
+        param_sum = []
+        for data in self.train_data:
+            out = exe.run(
+                main_prog, feed=feeder.feed(data), fetch_list=param_list
+            )
+            p_sum = 0
+            for v in out:
+                p_sum += np.sum(np.abs(v))
+            param_sum.append(p_sum)
+        return param_sum
+
+    def check_l2decay_regularizer(self, place, model):
+        paddle.seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        with self.scope_prog_guard(
+            main_prog=main_prog, startup_prog=startup_prog
+        ):
+            data = paddle.static.data(
+                name="words", shape=[-1, 1], dtype="int64", lod_level=1
+            )
+            label = paddle.static.data(
+                name="label", shape=[-1, 1], dtype="int64"
+            )
+
+            avg_cost = model(data, label, self.word_len)
+
+            optimizer = paddle.optimizer.Adagrad(
+                learning_rate=0.1,
+                weight_decay=paddle.regularizer.L2Decay(1.0),
+            )
+            optimizer.minimize(avg_cost)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def check_l2decay(self, place, model):
+        paddle.seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        main_prog = base.framework.Program()
+        startup_prog = base.framework.Program()
+
+        with self.scope_prog_guard(
+            main_prog=main_prog, startup_prog=startup_prog
+        ):
+            data = paddle.static.data(
+                name="words", shape=[-1, 1], dtype="int64", lod_level=1
+            )
+            label = paddle.static.data(
+                name="label", shape=[-1, 1], dtype="int64"
+            )
+
+            avg_cost_l2 = model(data, label, self.word_len)
+
+            param_list = base.default_main_program().block(0).all_parameters()
+            para_sum = []
+            for para in param_list:
+                para_mul = paddle.square(x=para)
+                para_sum.append(paddle.sum(para_mul))
+            avg_cost_l2 += paddle.add_n(para_sum) * 0.5
+
+            optimizer = paddle.optimizer.Adagrad(learning_rate=0.1)
+            optimizer.minimize(avg_cost_l2)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def test_l2(self):
+        paddle.enable_static()
+        for place in self.get_places():
+            dense_sparse_p_sum = []
+            for sparse in [True, False]:
+                model = partial(bow_net, is_sparse=sparse)
+                framework_l2 = self.check_l2decay_regularizer(place, model)
+                l2 = self.check_l2decay(place, model)
+                assert len(l2) == len(framework_l2)
+                for i in range(len(l2)):
+                    assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5)
+                dense_sparse_p_sum.append(framework_l2)
+
+            assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
+            for i in range(len(dense_sparse_p_sum[0])):
+                assert np.isclose(
+                    a=dense_sparse_p_sum[0][i],
+                    b=dense_sparse_p_sum[1][i],
+                    rtol=5e-5,
+                )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_regularizer.py b/test/deprecated/legacy_test/test_regularizer_deprecated.py
similarity index 70%
rename from test/deprecated/legacy_test/test_regularizer.py
rename to test/deprecated/legacy_test/test_regularizer_deprecated.py
index 8a3cd3da9a049..1727fe8b4f5d0 100644
--- a/test/deprecated/legacy_test/test_regularizer.py
+++ b/test/deprecated/legacy_test/test_regularizer_deprecated.py
@@ -23,7 +23,6 @@
 from paddle import base, regularizer
 from paddle.base import core, framework
 from paddle.base.backward import append_backward
-from paddle.pir_utils import test_with_pir_api
 
 
 class TestL2Decay(unittest.TestCase):
@@ -112,40 +111,6 @@ def test_l2decay_regularizer(self):
         self.assertEqual(block.ops[-2].type, 'scale')
         self.assertEqual(block.ops[-3].type, 'sign')
 
-    def test_l1decay_regularizer(self):
-        with paddle.pir_utils.IrGuard():
-            main_program = paddle.static.Program()
-            with paddle.static.program_guard(main_program):
-                block = main_program.global_block()
-                mul_x = paddle.pir.core.create_parameter(
-                    dtype="float32",
-                    shape=[5, 10],
-                    name="mul.x",
-                    regularizer=regularizer.L1Decay(0.5),
-                    initializer=paddle.nn.initializer.Constant(1),
-                )
-                self.assertIsNotNone(mul_x.regularizer)
-                self.assertTrue(
-                    isinstance(mul_x.regularizer, regularizer.L1Decay)
-                )
-
-                mul_y = paddle.static.data(
-                    dtype="float32", shape=[10, 8], name="mul.y"
-                )
-                mul_out = paddle.matmul(mul_x, mul_y)
-                mean_out = paddle.mean(mul_out)
-                grads = paddle.autograd.ir_backward.grad(mean_out, [mul_x])
-                params_grads = [(mul_x, grads[0])]
-                self.assertEqual(len(params_grads), 1)
-                count_ops = len(block.ops)
-                optimizer = paddle.optimizer.Adam()
-                params_grads = optimizer.append_regularization_ops(params_grads)
-                self.assertEqual(len(params_grads), 1)
-                self.assertEqual(len(block.ops), count_ops + 5)
-                self.assertEqual(block.ops[-1].name(), 'pd_op.add_n')
-                self.assertEqual(block.ops[-3].name(), 'pd_op.scale')
-                self.assertEqual(block.ops[-5].name(), 'pd_op.sign')
-
 
 def bow_net(
     data,
@@ -296,63 +261,6 @@ def test_l2(self):
                     rtol=5e-5,
                 )
 
-    @test_with_pir_api
-    def test_repeated_regularization(self):
-        l1 = paddle.regularizer.L1Decay(coeff=0.1)
-        l2 = paddle.regularizer.L2Decay(coeff=0.01)
-        fc_param_attr = paddle.ParamAttr(
-            regularizer=paddle.regularizer.L1Decay()
-        )
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            x = paddle.uniform([2, 2, 3])
-            linear = paddle.nn.Linear(3, 5, weight_attr=fc_param_attr)
-            out = linear(x)
-            loss = paddle.sum(out)
-            sgd = paddle.optimizer.SGD(learning_rate=0.1, weight_decay=l2)
-            sgd.minimize(loss)
-        with base.dygraph.guard():
-            input = paddle.to_tensor(np.random.randn(3, 2).astype('float32'))
-            paddle.seed(1)
-            paddle.framework.random._manual_program_seed(1)
-
-            linear1 = paddle.nn.Linear(
-                2, 2, weight_attr=fc_param_attr, bias_attr=fc_param_attr
-            )
-            linear2 = paddle.nn.Linear(
-                2, 2, weight_attr=fc_param_attr, bias_attr=fc_param_attr
-            )
-
-            loss1 = linear1(input)
-            loss1.backward()
-            # set l2 regularizer in optimizer, but l1 in base.ParamAttr
-
-            paddle.optimizer.SGD(
-                parameters=linear1.parameters(),
-                learning_rate=1e-2,
-                weight_decay=l2,
-            ).minimize(loss1)
-            # only set l1 in base.ParamAttr
-            loss2 = linear2(input)
-            loss2.backward()
-            paddle.optimizer.SGD(
-                parameters=linear2.parameters(), learning_rate=1e-2
-            ).minimize(loss2)
-            # they should both be applied by l1, and keep the same
-            np.testing.assert_allclose(
-                linear1.weight.numpy(),
-                linear2.weight.numpy(),
-                rtol=1e-05,
-                err_msg='weight should use the regularization in base.ParamAttr!',
-            )
-            np.testing.assert_allclose(
-                linear1.bias.numpy(),
-                linear2.bias.numpy(),
-                rtol=1e-05,
-                err_msg='bias should use the regularization in base.ParamAttr!',
-            )
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_run_program_op.py b/test/deprecated/legacy_test/test_run_program_op_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_run_program_op.py
rename to test/deprecated/legacy_test/test_run_program_op_deprecated.py
diff --git a/test/deprecated/legacy_test/test_select_input_output_op.py b/test/deprecated/legacy_test/test_select_input_output_op_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_select_input_output_op.py
rename to test/deprecated/legacy_test/test_select_input_output_op_deprecated.py
diff --git a/test/deprecated/legacy_test/test_sgd_op_deprecated.py b/test/deprecated/legacy_test/test_sgd_op_deprecated.py
new file mode 100644
index 0000000000000..11d899f755526
--- /dev/null
+++ b/test/deprecated/legacy_test/test_sgd_op_deprecated.py
@@ -0,0 +1,214 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+paddle.enable_static()
+
+
+def sgd_wrapper(
+    param, learning_rate, grad, master_param=None, multi_precision=False
+):
+    paddle._C_ops.sgd_(
+        param, learning_rate, grad, master_param, multi_precision
+    )
+
+
+class TestSGDOpWithLargeInput(unittest.TestCase):
+    def runTest(self):
+        paddle.enable_static()
+        data = paddle.tensor.fill_constant(shape=[1], value=128, dtype='int64')
+        label = paddle.tensor.fill_constant(
+            shape=[1, 150], value=0.5, dtype='float32'
+        )
+        emb = paddle.static.nn.embedding(
+            input=data, size=(10000000, 150), dtype='float32'
+        )
+        out = paddle.nn.functional.normalize(x=emb, axis=-1)
+
+        cost = paddle.nn.functional.square_error_cost(input=out, label=label)
+        avg_cost = paddle.mean(cost)
+        sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        sgd_optimizer.minimize(avg_cost)
+
+        place = base.CPUPlace()
+        exe = base.Executor(place)
+        exe.run(base.default_startup_program())
+        compiled_prog = base.compiler.CompiledProgram(
+            base.default_main_program()
+        )
+        result = exe.run(compiled_prog, fetch_list=[avg_cost])
+
+
+class TestSGDV2(unittest.TestCase):
+    def test_sgd(self):
+        paddle.enable_static()
+
+        def check_sgd_optimizer(optimizer_attr):
+            init_program = paddle.static.Program()
+            program = paddle.static.Program()
+            block = program.global_block()
+            mul_x = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="mul.x",
+                optimize_attr=optimizer_attr,
+            )
+            mul_y = block.create_var(
+                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y"
+            )
+            mul_out = block.create_var(
+                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out"
+            )
+            mean_out = block.create_var(
+                dtype="float32", shape=[1], lod_level=0, name="mean.out"
+            )
+            block.append_op(
+                type="mul",
+                inputs={"X": mul_x, "Y": mul_y},
+                outputs={"Out": mul_out},
+                attrs={"x_num_col_dims": 1},
+            )
+            block.append_op(
+                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}
+            )
+            sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            opts, _ = sgd_optimizer.minimize(mean_out, init_program)
+            return opts
+
+        opts = check_sgd_optimizer({'learning_rate': 1.1})
+        self.assertEqual(len(opts), 2)
+        self.assertEqual([op.type for op in opts], ["scale", "sgd"])
+
+        opts = check_sgd_optimizer({'learning_rate': 1.0})
+        self.assertEqual(len(opts), 1)
+        self.assertEqual([op.type for op in opts], ["sgd"])
+
+
+class TestSGDMultiPrecision2_0(unittest.TestCase):
+    def dygraph_sgd_mp(self, mp):
+        """Run 5 dygraph SGD steps on GPU; return (last output, params)."""
+        paddle.disable_static()
+        paddle.seed(10)
+        paddle.set_device('gpu')
+        input = paddle.randn((2, 2))
+        model = paddle.nn.Linear(2, 2)
+        optimizer = paddle.optimizer.SGD(
+            parameters=model.parameters(), multi_precision=mp
+        )
+        if mp:
+            model = paddle.amp.decorate(models=model, level='O2')
+            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+        for idx in range(5):
+            if mp:
+                with paddle.amp.auto_cast(level='O2'):
+                    output = model(input)
+                    loss = paddle.mean(output)
+                scaled = scaler.scale(loss)
+                scaled.backward()
+                scaler.minimize(optimizer, scaled)
+            else:
+                output = model(input)
+                loss = paddle.mean(output)
+                # step() needs the gradients produced by backward().
+                loss.backward()
+                optimizer.step()
+            optimizer.clear_grad()
+        return output, model.parameters()
+
+    def static_sgd_mp(self, mp):
+        paddle.enable_static()
+        paddle.seed(10)
+        np.random.seed(10)
+        exe = paddle.static.Executor('gpu')
+        train_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        optimizer = paddle.optimizer.SGD(multi_precision=mp)
+
+        if mp:
+            optimizer = paddle.static.amp.decorate(
+                optimizer,
+                init_loss_scaling=128.0,
+                use_dynamic_loss_scaling=True,
+                use_pure_fp16=True,
+                use_fp16_guard=False,
+            )
+        with paddle.static.program_guard(train_program, startup_program):
+            if mp:
+                data = paddle.static.data(
+                    shape=[2, 2], name='X', dtype='float16'
+                )
+            else:
+                data = paddle.static.data(
+                    shape=[2, 2], name='X', dtype='float32'
+                )
+            hidden = paddle.static.nn.fc(x=data, size=10)
+            loss = paddle.mean(hidden)
+            optimizer.minimize(loss)
+        exe.run(startup_program)
+
+        if mp:
+            optimizer.amp_init(
+                place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
+            )
+            x = np.random.random(size=(2, 2)).astype('float16')
+        else:
+            x = np.random.random(size=(2, 2)).astype('float32')
+        out = []
+        for idx in range(5):
+            (loss_data,) = exe.run(
+                train_program, feed={"X": x}, fetch_list=[loss]
+            )
+            out.append(loss_data)
+        return out
+
+    def test_main(self):
+        """Check multi-precision SGD against fp32 in dygraph and static mode."""
+        if not paddle.is_compiled_with_cuda():
+            return
+        output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True)
+        output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False)
+        np.testing.assert_allclose(
+            output1_dy.astype('float32').numpy(),
+            output2_dy.astype('float32').numpy(),
+            rtol=1e-05,
+            atol=0.1,
+        )
+        for idx in range(len(params1_dy)):
+            np.testing.assert_allclose(
+                params1_dy[idx].astype('float32').numpy(),
+                params2_dy[idx].astype('float32').numpy(),
+                rtol=1e-05,
+                atol=0.1,
+            )
+        # Static graph mode: compare the per-step losses of both runs.
+        output1_st = self.static_sgd_mp(mp=True)
+        output2_st = self.static_sgd_mp(mp=False)
+        for idx in range(len(output1_st)):
+            np.testing.assert_allclose(
+                output1_st[idx].astype('float32'),
+                output2_st[idx].astype('float32'),
+                rtol=1e-05,
+                atol=0.1,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_slice_op.py b/test/deprecated/legacy_test/test_slice_op.py
index 3a91882cac51e..1040eb676952a 100644
--- a/test/deprecated/legacy_test/test_slice_op.py
+++ b/test/deprecated/legacy_test/test_slice_op.py
@@ -1011,15 +1011,6 @@ def test_float_in_index():
 
 
 class TestInferShape(unittest.TestCase):
-    def test(self):
-        with paddle_static_guard():
-            x = paddle.ones(shape=[3, 4, 5])
-            x.desc.set_shape([3, -1, 5])
-            self.assertEqual(x.shape, (3, -1, 5))
-
-            out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3])
-            self.assertEqual(out0.shape, (3, -1, 5))
-
     def test_pir(self):
         with paddle.pir_utils.IrGuard():
             x = paddle.static.data('x', shape=[3, -1, 5])
diff --git a/test/deprecated/legacy_test/test_slice_op_deprecated.py b/test/deprecated/legacy_test/test_slice_op_deprecated.py
new file mode 100644
index 0000000000000..a9ba98f3dba72
--- /dev/null
+++ b/test/deprecated/legacy_test/test_slice_op_deprecated.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from op_test import paddle_static_guard
+
+import paddle
+
+paddle.enable_static()
+
+
+class TestInferShape(unittest.TestCase):
+    def test(self):
+        with paddle_static_guard():
+            x = paddle.ones(shape=[3, 4, 5])
+            x.desc.set_shape([3, -1, 5])  # overwrite the static dim 1 with -1
+            self.assertEqual(x.shape, (3, -1, 5))
+            # Slicing along axis 1 is expected to keep -1 for that axis.
+            out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3])
+            self.assertEqual(out0.shape, (3, -1, 5))
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_squared_l2_norm_op.py b/test/deprecated/legacy_test/test_squared_l2_norm_op.py
deleted file mode 100755
index df36c81097051..0000000000000
--- a/test/deprecated/legacy_test/test_squared_l2_norm_op.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from numpy import linalg as LA
-from op_test import OpTest
-
-import paddle
-import paddle.distributed as dist
-from paddle import _C_ops, _legacy_C_ops
-from paddle.framework import in_dynamic_mode
-
-
-def test_squared_l2_norm(x):
-    if in_dynamic_mode():
-        return _C_ops.squared_l2_norm(x)
-    else:
-        return _legacy_C_ops.squared_l2_norm(x)
-
-
-class TestSquaredL2NormF16Op(unittest.TestCase):
-    def init_test_case(self):
-        X = np.random.uniform(-0.1, 0.1, (8, 5, 10)).astype('float32')
-        return X
-
-    def check_main(self, x_np, dtype):
-        paddle.disable_static()
-        x = paddle.to_tensor(x_np)
-
-        x.stop_gradient = False
-        y = test_squared_l2_norm(x)
-        x_g = paddle.grad(y, [x])
-
-        paddle.enable_static()
-        return y, x_g
-
-    def test_main(self):
-        x_np = self.init_test_case()
-        y_np_1, x_g_np_1 = self.check_main(x_np, 'float32')
-        y_np_2, x_g_np_2 = self.check_main(x_np, 'float16')
-
-        def assert_equal(x, y):
-            np.testing.assert_allclose(x, y, rtol=1e-05, atol=0.0)
-
-        assert_equal(y_np_1, y_np_2)
-        assert_equal(x_g_np_1, x_g_np_2)
-
-
-class TestSquaredL2NormF16Op1(TestSquaredL2NormF16Op):
-    def init_test_case(self):
-        X = np.random.uniform(-2.0, 2.0, (30, 10)).astype('float32')
-        return X
-
-
-class TestSquaredL2NormF16Op2(TestSquaredL2NormF16Op):
-    def init_test_case(self):
-        X = np.random.uniform(-5.0, 5.0, (20, 10, 20)).astype('float32')
-        return X
-
-
-class TestL2LossOp(OpTest):
-    """Test squared_l2_norm"""
-
-    def config(self):
-        self.x_shape = (13, 19)
-        self.check_auto_parallel = False
-
-    def setUp(self):
-        self.config()
-        self.python_api = test_squared_l2_norm
-        self.op_type = "squared_l2_norm"
-        self.max_relative_error = 0.05
-
-        X = np.random.uniform(-1, 1, self.x_shape).astype("float32")
-        X[np.abs(X) < self.max_relative_error] = 0.1
-        self.inputs = {'X': X}
-        self.outputs = {'Out': np.array([np.square(LA.norm(X))])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            max_relative_error=self.max_relative_error,
-            check_auto_parallel=self.check_auto_parallel,
-        )
-
-
-class TestSquaredL2NormAutoParallel_1(TestL2LossOp):
-    def config(self):
-        self.x_shape = (14, 18)
-        self.check_auto_parallel = True
-        self.placements = {
-            'X': [dist.Replicate()],
-        }
-
-
-class TestSquaredL2NormAutoParallel_2(TestL2LossOp):
-    def config(self):
-        self.x_shape = (14, 18)
-        self.check_auto_parallel = True
-        self.placements = {
-            'X': [dist.Shard(0)],
-        }
-
-
-class TestSquaredL2NormAutoParallel_3(TestL2LossOp):
-    def config(self):
-        self.x_shape = (14, 18)
-        self.check_auto_parallel = True
-        self.placements = {
-            'X': [dist.Shard(1)],
-        }
-
-
-class TestL2LossDeterministic(unittest.TestCase):
-    def check_place(self, place):
-        with paddle.base.dygraph.guard(place):
-            x_np = np.random.rand(5, 11, 13).astype('float32')
-            x = paddle.to_tensor(x_np)
-            y1 = _legacy_C_ops.squared_l2_norm(x)
-            y2 = _legacy_C_ops.squared_l2_norm(x)
-            np.testing.assert_array_equal(y1.numpy(), y2.numpy())
-
-    def test_main(self):
-        self.check_place(paddle.CPUPlace())
-        if paddle.is_compiled_with_cuda():
-            self.check_place(paddle.CUDAPlace(0))
-
-
-if __name__ == "__main__":
-    paddle.enable_static()
-    unittest.main()
diff --git a/test/deprecated/legacy_test/test_squeeze2_op_rename.py b/test/deprecated/legacy_test/test_squeeze2_op_rename.py
index ed347eda7350b..02e63c0cb2459 100644
--- a/test/deprecated/legacy_test/test_squeeze2_op_rename.py
+++ b/test/deprecated/legacy_test/test_squeeze2_op_rename.py
@@ -15,7 +15,7 @@
 import os
 import unittest
 
-from test_attribute_var import UnittestBase
+from test_attribute_var_deprecated import UnittestBase
 
 import paddle
 from paddle.base.framework import Program, program_guard
diff --git a/test/deprecated/legacy_test/test_static_pylayer_block.py b/test/deprecated/legacy_test/test_static_pylayer_block_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_static_pylayer_block.py
rename to test/deprecated/legacy_test/test_static_pylayer_block_deprecated.py
diff --git a/test/deprecated/legacy_test/test_static_pylayer.py b/test/deprecated/legacy_test/test_static_pylayer_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_static_pylayer.py
rename to test/deprecated/legacy_test/test_static_pylayer_deprecated.py
index ec0e655d3b6a9..e15ba4ee363be 100644
--- a/test/deprecated/legacy_test/test_static_pylayer.py
+++ b/test/deprecated/legacy_test/test_static_pylayer_deprecated.py
@@ -13,10 +13,15 @@
 # limitations under the License.
 
 import functools
+import sys
 import unittest
 
+sys.path.append(".")
 import numpy as np
-from test_prune import TestExecutorRunAutoPrune, TestPruneBase
+from test_prune_deprecated import (
+    TestExecutorRunAutoPrune,
+    TestPruneBase,
+)
 
 import paddle
 from paddle import base
diff --git a/test/deprecated/legacy_test/test_switch.py b/test/deprecated/legacy_test/test_switch_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_switch.py
rename to test/deprecated/legacy_test/test_switch_deprecated.py
index 3c90ba5260542..d8b2e2fd061ad 100644
--- a/test/deprecated/legacy_test/test_switch.py
+++ b/test/deprecated/legacy_test/test_switch_deprecated.py
@@ -19,6 +19,8 @@
 from paddle.base.executor import Executor
 from paddle.base.framework import default_startup_program
 
+paddle.enable_static()
+
 
 class TestSwitch(unittest.TestCase):
     def check_switch(self, value):
diff --git a/test/deprecated/legacy_test/test_tensor_array_to_tensor.py b/test/deprecated/legacy_test/test_tensor_array_to_tensor.py
index 91310fc2880fb..7d043ad52dac7 100644
--- a/test/deprecated/legacy_test/test_tensor_array_to_tensor.py
+++ b/test/deprecated/legacy_test/test_tensor_array_to_tensor.py
@@ -43,122 +43,6 @@ def test_list_Variable():
             self.assertRaises(TypeError, test_list_Variable)
 
 
-class TestLoDTensorArrayConcat(unittest.TestCase):
-    """Test case for concat mode of tensor_array_to_tensor."""
-
-    def setUp(self):
-        self.op_type = "tensor_array_to_tensor"
-        self.attrs = {"axis": 0}
-        self.outputs = ["Out"]
-
-    def test_get_set(self):
-        scope = core.Scope()
-        program = base.Program()
-        block = program.global_block()
-
-        input_arr = block.create_var(
-            name="tmp_lod_tensor_array",
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        )
-        input_arr.persistable = True
-        input_arr_var = scope.var('tmp_lod_tensor_array')
-        input_tensor_array = input_arr_var.get_lod_tensor_array()
-        self.assertEqual(0, len(input_tensor_array))
-
-        cpu = core.CPUPlace()
-        for i in range(10):
-            t = core.LoDTensor()
-            if i == 0:
-                t.set(np.array([[i], [i]], dtype='float32'), cpu)
-            else:
-                t.set(np.array([[i]], dtype='float32'), cpu)
-            input_tensor_array.append(t)
-
-        self.assertEqual(10, len(input_tensor_array))
-
-        random_grad = np.random.random_sample([11]).astype(np.float32)
-
-        y_out = block.create_var(name="Out")
-        y_out.persistable = True
-        y_out_index = block.create_var(name="OutIndex")
-        y_out_index.persistable = True
-
-        y_grad_arr = block.create_var(
-            name='Out@GRAD', dtype='float32', shape=[11]
-        )
-        y_grad_arr.persistable = True
-        y_grad = scope.var('Out@GRAD')
-        y_grad_tensor = y_grad.get_tensor()
-        y_grad_tensor.set(random_grad, cpu)
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs={"X": input_arr},
-            outputs={"Out": y_out, "OutIndex": y_out_index},
-            attrs=self.attrs,
-        )
-
-        out_grad = block.create_var(
-            name="tmp_lod_tensor_array@GRAD",
-            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-        )
-        out_grad.persistable = True
-
-        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-            op.desc, set(), []
-        )
-        grad_op_desc = grad_op_desc_list[0]
-        new_op_desc = block.desc.append_op()
-        new_op_desc.copy_from(grad_op_desc)
-        for var_name in grad_op_desc.output_arg_names():
-            block.desc.var(var_name.encode("ascii"))
-
-        grad_op_desc.infer_var_type(block.desc)
-        grad_op_desc.infer_shape(block.desc)
-        for arg in grad_op_desc.output_arg_names():
-            grad_var = block.desc.find_var(arg.encode("ascii"))
-            grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-        fetch_list = []
-        fetch_list.append(block.var('Out'))
-        fetch_list.append(block.var('OutIndex'))
-
-        exe = base.Executor(base.CPUPlace())
-        out = exe.run(program, fetch_list=fetch_list, scope=scope)
-        # print ("index: ", np.array(out[1]))
-
-        # test forward
-        tensor_res = np.array(out[0])
-        tensor_gt = np.array(
-            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32'
-        )
-
-        self.assertEqual(len(tensor_res), len(tensor_gt))
-
-        for i in range(len(tensor_res)):
-            self.assertEqual(tensor_res[i], tensor_gt[i])
-
-        # test backward
-        grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
-        grad_tensor_array = grad_tensor.get_lod_tensor_array()
-
-        self.assertEqual(10, len(grad_tensor_array))
-
-        for i in range(len(grad_tensor_array)):
-            if i == 0:
-                self.assertEqual(
-                    np.array(grad_tensor_array[i])[0], np.array(random_grad[i])
-                )
-                self.assertEqual(
-                    np.array(grad_tensor_array[i])[1],
-                    np.array(random_grad[i + 1]),
-                )
-            if i == 1:
-                self.assertEqual(
-                    np.array(grad_tensor_array[i]), np.array(random_grad[i + 1])
-                )
-
-
 class TestLoDTensorArrayStack(unittest.TestCase):
     """Test case for stack mode of tensor_array_to_tensor."""
 
diff --git a/test/deprecated/legacy_test/test_tensor_array_to_tensor_deprecated.py b/test/deprecated/legacy_test/test_tensor_array_to_tensor_deprecated.py
new file mode 100644
index 0000000000000..959eddee79fe6
--- /dev/null
+++ b/test/deprecated/legacy_test/test_tensor_array_to_tensor_deprecated.py
@@ -0,0 +1,143 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+
+paddle.enable_static()
+
+
+class TestLoDTensorArrayConcat(unittest.TestCase):
+    """Test case for concat mode of tensor_array_to_tensor."""
+
+    def setUp(self):
+        self.op_type = "tensor_array_to_tensor"
+        self.attrs = {"axis": 0}
+        self.outputs = ["Out"]
+
+    def test_get_set(self):
+        scope = core.Scope()
+        program = base.Program()
+        block = program.global_block()
+        # Declare the input array in the program and create it in the scope.
+        input_arr = block.create_var(
+            name="tmp_lod_tensor_array",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        )
+        input_arr.persistable = True
+        input_arr_var = scope.var('tmp_lod_tensor_array')
+        input_tensor_array = input_arr_var.get_lod_tensor_array()
+        self.assertEqual(0, len(input_tensor_array))
+        # Append 10 tensors: the first has shape (2, 1), the rest (1, 1).
+        cpu = core.CPUPlace()
+        for i in range(10):
+            t = core.LoDTensor()
+            if i == 0:
+                t.set(np.array([[i], [i]], dtype='float32'), cpu)
+            else:
+                t.set(np.array([[i]], dtype='float32'), cpu)
+            input_tensor_array.append(t)
+
+        self.assertEqual(10, len(input_tensor_array))
+
+        random_grad = np.random.random_sample([11]).astype(np.float32)
+
+        y_out = block.create_var(name="Out")
+        y_out.persistable = True
+        y_out_index = block.create_var(name="OutIndex")
+        y_out_index.persistable = True
+        # Seed Out@GRAD in the scope with the random upstream gradient.
+        y_grad_arr = block.create_var(
+            name='Out@GRAD', dtype='float32', shape=[11]
+        )
+        y_grad_arr.persistable = True
+        y_grad = scope.var('Out@GRAD')
+        y_grad_tensor = y_grad.get_tensor()
+        y_grad_tensor.set(random_grad, cpu)
+        # Forward op: X -> (Out, OutIndex).
+        op = block.append_op(
+            type=self.op_type,
+            inputs={"X": input_arr},
+            outputs={"Out": y_out, "OutIndex": y_out_index},
+            attrs=self.attrs,
+        )
+
+        out_grad = block.create_var(
+            name="tmp_lod_tensor_array@GRAD",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        )
+        out_grad.persistable = True
+        # Derive the grad op from the forward op and append it to the block.
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, set(), []
+        )
+        grad_op_desc = grad_op_desc_list[0]
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(grad_op_desc)
+        for var_name in grad_op_desc.output_arg_names():
+            block.desc.var(var_name.encode("ascii"))
+
+        grad_op_desc.infer_var_type(block.desc)
+        grad_op_desc.infer_shape(block.desc)
+        for arg in grad_op_desc.output_arg_names():
+            grad_var = block.desc.find_var(arg.encode("ascii"))
+            grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+        fetch_list = []
+        fetch_list.append(block.var('Out'))
+        fetch_list.append(block.var('OutIndex'))
+
+        exe = base.Executor(base.CPUPlace())
+        out = exe.run(program, fetch_list=fetch_list, scope=scope)
+        # NOTE(review): out[1] (OutIndex) is fetched but not verified below.
+
+        # test forward: 11 values expected (entry 0 contributes two rows)
+        tensor_res = np.array(out[0])
+        tensor_gt = np.array(
+            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32'
+        )
+
+        self.assertEqual(len(tensor_res), len(tensor_gt))
+
+        for i in range(len(tensor_res)):
+            self.assertEqual(tensor_res[i], tensor_gt[i])
+
+        # test backward: only the first two grad entries are value-checked
+        grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
+        grad_tensor_array = grad_tensor.get_lod_tensor_array()
+
+        self.assertEqual(10, len(grad_tensor_array))
+
+        for i in range(len(grad_tensor_array)):
+            if i == 0:
+                self.assertEqual(
+                    np.array(grad_tensor_array[i])[0], np.array(random_grad[i])
+                )
+                self.assertEqual(
+                    np.array(grad_tensor_array[i])[1],
+                    np.array(random_grad[i + 1]),
+                )
+            if i == 1:
+                self.assertEqual(
+                    np.array(grad_tensor_array[i]), np.array(random_grad[i + 1])
+                )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_trainable.py b/test/deprecated/legacy_test/test_trainable_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_trainable.py
rename to test/deprecated/legacy_test/test_trainable_deprecated.py
diff --git a/test/deprecated/legacy_test/test_truncated_gaussian_random_op.py b/test/deprecated/legacy_test/test_truncated_gaussian_random_op_deprecated.py
similarity index 100%
rename from test/deprecated/legacy_test/test_truncated_gaussian_random_op.py
rename to test/deprecated/legacy_test/test_truncated_gaussian_random_op_deprecated.py
diff --git a/test/deprecated/legacy_test/test_variable.py b/test/deprecated/legacy_test/test_variable.py
index cbe52f6ef103e..5b33f4c6a4cc8 100644
--- a/test/deprecated/legacy_test/test_variable.py
+++ b/test/deprecated/legacy_test/test_variable.py
@@ -112,125 +112,6 @@ def test_step_scopes(self):
         )
         self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
 
-    def _test_slice(self, place):
-        b = default_main_program().current_block()
-        w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
-
-        for i in range(3):
-            nw = w[i]
-            self.assertEqual((100, 100), nw.shape)
-
-        nw = w[:]
-        self.assertEqual((784, 100, 100), nw.shape)
-
-        nw = w[:, :]
-        self.assertEqual((784, 100, 100), nw.shape)
-
-        nw = w[:, :, -1]
-        self.assertEqual((784, 100), nw.shape)
-
-        nw = w[1, 1, 1]
-
-        self.assertEqual(len(nw.shape), 0)
-
-        nw = w[:, :, :-1]
-        self.assertEqual((784, 100, 99), nw.shape)
-
-        self.assertEqual(0, nw.lod_level)
-
-        main = base.Program()
-        with base.program_guard(main):
-            exe = base.Executor(place)
-            tensor_array = np.array(
-                [
-                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
-                    [[19, 20, 21], [22, 23, 24], [25, 26, 27]],
-                ]
-            ).astype('float32')
-            var = paddle.assign(tensor_array)
-            var1 = var[0, 1, 1]
-            var2 = var[1:]
-            var3 = var[0:1]
-            var4 = var[::-1]
-            var5 = var[1, 1:, 1:]
-            var_reshape = paddle.reshape(var, [3, -1, 3])
-            var6 = var_reshape[:, :, -1]
-            var7 = var[:, :, :-1]
-            var8 = var[:1, :1, :1]
-            var9 = var[:-1, :-1, :-1]
-            var10 = var[::-1, :1, :-1]
-            var11 = var[:-1, ::-1, -1:]
-            var12 = var[1:2, 2:, ::-1]
-            var13 = var[2:10, 2:, -2:-1]
-            var14 = var[1:-1, 0:2, ::-1]
-            var15 = var[::-1, ::-1, ::-1]
-
-            x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32')
-            y = paddle.static.nn.fc(x, size=1, activation=None)
-            y_1 = y[:, 0]
-            feeder = base.DataFeeder(place=place, feed_list=[x])
-            data = []
-            data.append(np.random.randint(10, size=[13]).astype('float32'))
-            exe.run(base.default_startup_program())
-
-            local_out = exe.run(
-                main,
-                feed=feeder.feed([data]),
-                fetch_list=[
-                    var,
-                    var1,
-                    var2,
-                    var3,
-                    var4,
-                    var5,
-                    var6,
-                    var7,
-                    var8,
-                    var9,
-                    var10,
-                    var11,
-                    var12,
-                    var13,
-                    var14,
-                    var15,
-                ],
-            )
-
-            np.testing.assert_array_equal(local_out[1], tensor_array[0, 1, 1:2])
-            np.testing.assert_array_equal(local_out[2], tensor_array[1:])
-            np.testing.assert_array_equal(local_out[3], tensor_array[0:1])
-            np.testing.assert_array_equal(local_out[4], tensor_array[::-1])
-            np.testing.assert_array_equal(local_out[5], tensor_array[1, 1:, 1:])
-            np.testing.assert_array_equal(
-                local_out[6], tensor_array.reshape((3, -1, 3))[:, :, -1]
-            )
-            np.testing.assert_array_equal(local_out[7], tensor_array[:, :, :-1])
-            np.testing.assert_array_equal(
-                local_out[8], tensor_array[:1, :1, :1]
-            )
-            np.testing.assert_array_equal(
-                local_out[9], tensor_array[:-1, :-1, :-1]
-            )
-            np.testing.assert_array_equal(
-                local_out[10], tensor_array[::-1, :1, :-1]
-            )
-            np.testing.assert_array_equal(
-                local_out[11], tensor_array[:-1, ::-1, -1:]
-            )
-            np.testing.assert_array_equal(
-                local_out[12], tensor_array[1:2, 2:, ::-1]
-            )
-            np.testing.assert_array_equal(
-                local_out[13], tensor_array[2:10, 2:, -2:-1]
-            )
-            np.testing.assert_array_equal(
-                local_out[14], tensor_array[1:-1, 0:2, ::-1]
-            )
-            np.testing.assert_array_equal(
-                local_out[15], tensor_array[::-1, ::-1, ::-1]
-            )
-
     def _test_slice_index_tensor(self, place):
         data = np.random.rand(2, 3).astype("float32")
         prog = paddle.static.Program()
@@ -391,7 +272,6 @@ def test_slice(self):
             places.append(core.CUDAPlace(0))
 
         for place in places:
-            self._test_slice(place)
             self._test_slice_index_tensor(place)
             self._test_slice_index_list(place)
             self._test_slice_index_ellipsis(place)
diff --git a/test/deprecated/legacy_test/test_variable_deprecated.py b/test/deprecated/legacy_test/test_variable_deprecated.py
new file mode 100644
index 0000000000000..6d416d1a20344
--- /dev/null
+++ b/test/deprecated/legacy_test/test_variable_deprecated.py
@@ -0,0 +1,162 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+from paddle.base.framework import (
+    default_main_program,
+)
+
+paddle.enable_static()
+
+
+class TestVariable(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(2022)
+
+    def _test_slice(self, place):
+        b = default_main_program().current_block()
+        w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
+        # Shape-only checks; nothing is executed until exe.run below.
+        for i in range(3):
+            nw = w[i]
+            self.assertEqual((100, 100), nw.shape)
+
+        nw = w[:]
+        self.assertEqual((784, 100, 100), nw.shape)
+
+        nw = w[:, :]
+        self.assertEqual((784, 100, 100), nw.shape)
+
+        nw = w[:, :, -1]
+        self.assertEqual((784, 100), nw.shape)
+
+        nw = w[1, 1, 1]
+
+        self.assertEqual(len(nw.shape), 0)
+
+        nw = w[:, :, :-1]
+        self.assertEqual((784, 100, 99), nw.shape)
+
+        self.assertEqual(0, nw.lod_level)
+        # Value checks: run the slices and compare with NumPy indexing.
+        main = base.Program()
+        with base.program_guard(main):
+            exe = base.Executor(place)
+            tensor_array = np.array(
+                [
+                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                    [[10, 11, 12], [13, 14, 15], [16, 17, 18]],
+                    [[19, 20, 21], [22, 23, 24], [25, 26, 27]],
+                ]
+            ).astype('float32')
+            var = paddle.assign(tensor_array)
+            var1 = var[0, 1, 1]
+            var2 = var[1:]
+            var3 = var[0:1]
+            var4 = var[::-1]
+            var5 = var[1, 1:, 1:]
+            var_reshape = paddle.reshape(var, [3, -1, 3])
+            var6 = var_reshape[:, :, -1]
+            var7 = var[:, :, :-1]
+            var8 = var[:1, :1, :1]
+            var9 = var[:-1, :-1, :-1]
+            var10 = var[::-1, :1, :-1]
+            var11 = var[:-1, ::-1, -1:]
+            var12 = var[1:2, 2:, ::-1]
+            var13 = var[2:10, 2:, -2:-1]
+            var14 = var[1:-1, 0:2, ::-1]
+            var15 = var[::-1, ::-1, ::-1]
+            # NOTE(review): y_1 is built but never fetched or asserted.
+            x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32')
+            y = paddle.static.nn.fc(x, size=1, activation=None)
+            y_1 = y[:, 0]
+            feeder = base.DataFeeder(place=place, feed_list=[x])
+            data = []
+            data.append(np.random.randint(10, size=[13]).astype('float32'))
+            exe.run(base.default_startup_program())
+
+            local_out = exe.run(
+                main,
+                feed=feeder.feed([data]),
+                fetch_list=[
+                    var,
+                    var1,
+                    var2,
+                    var3,
+                    var4,
+                    var5,
+                    var6,
+                    var7,
+                    var8,
+                    var9,
+                    var10,
+                    var11,
+                    var12,
+                    var13,
+                    var14,
+                    var15,
+                ],
+            )
+            # NOTE(review): local_out[0] (var itself) is fetched, not asserted.
+            np.testing.assert_array_equal(local_out[1], tensor_array[0, 1, 1:2])
+            np.testing.assert_array_equal(local_out[2], tensor_array[1:])
+            np.testing.assert_array_equal(local_out[3], tensor_array[0:1])
+            np.testing.assert_array_equal(local_out[4], tensor_array[::-1])
+            np.testing.assert_array_equal(local_out[5], tensor_array[1, 1:, 1:])
+            np.testing.assert_array_equal(
+                local_out[6], tensor_array.reshape((3, -1, 3))[:, :, -1]
+            )
+            np.testing.assert_array_equal(local_out[7], tensor_array[:, :, :-1])
+            np.testing.assert_array_equal(
+                local_out[8], tensor_array[:1, :1, :1]
+            )
+            np.testing.assert_array_equal(
+                local_out[9], tensor_array[:-1, :-1, :-1]
+            )
+            np.testing.assert_array_equal(
+                local_out[10], tensor_array[::-1, :1, :-1]
+            )
+            np.testing.assert_array_equal(
+                local_out[11], tensor_array[:-1, ::-1, -1:]
+            )
+            np.testing.assert_array_equal(
+                local_out[12], tensor_array[1:2, 2:, ::-1]
+            )
+            np.testing.assert_array_equal(
+                local_out[13], tensor_array[2:10, 2:, -2:-1]
+            )
+            np.testing.assert_array_equal(
+                local_out[14], tensor_array[1:-1, 0:2, ::-1]
+            )
+            np.testing.assert_array_equal(
+                local_out[15], tensor_array[::-1, ::-1, ::-1]
+            )
+
+    def test_slice(self):
+        places = [base.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            self._test_slice(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_weight_normalization.py b/test/deprecated/legacy_test/test_weight_normalization_deprecated.py
similarity index 99%
rename from test/deprecated/legacy_test/test_weight_normalization.py
rename to test/deprecated/legacy_test/test_weight_normalization_deprecated.py
index f8793aef3052e..6e799166b4d42 100644
--- a/test/deprecated/legacy_test/test_weight_normalization.py
+++ b/test/deprecated/legacy_test/test_weight_normalization_deprecated.py
@@ -22,6 +22,8 @@
 from paddle.base import core
 from paddle.base.param_attr import WeightNormParamAttr
 
+paddle.enable_static()
+
 
 class TestWeightNormalization(unittest.TestCase):
     batch_size = 3
diff --git a/test/deprecated/prim/composite_ops/test_composite_layer_norm.py b/test/deprecated/prim/composite_ops/test_composite_layer_norm_deprecated.py
similarity index 100%
rename from test/deprecated/prim/composite_ops/test_composite_layer_norm.py
rename to test/deprecated/prim/composite_ops/test_composite_layer_norm_deprecated.py
diff --git a/test/deprecated/prim/pir_prim/CMakeLists.txt b/test/deprecated/prim/pir_prim/CMakeLists.txt
index 340b94fc53c95..15d788ccff424 100644
--- a/test/deprecated/prim/pir_prim/CMakeLists.txt
+++ b/test/deprecated/prim/pir_prim/CMakeLists.txt
@@ -1,5 +1,6 @@
-set(TEST_PRIM_TRANS_PIR_CASES test_custom_vjp_trait test_decomp_op
-                              test_decompose_op test_vjp_prim)
+set(TEST_PRIM_TRANS_PIR_CASES
+    test_custom_vjp_trait test_decomp_op test_decompose_op test_vjp_prim
+    test_batch_norm_shape_check)
 
 foreach(target ${TEST_PRIM_TRANS_PIR_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
diff --git a/test/deprecated/prim/pir_prim/test_batch_norm_shape_check.py b/test/deprecated/prim/pir_prim/test_batch_norm_shape_check.py
new file mode 100644
index 0000000000000..045a88695d9e3
--- /dev/null
+++ b/test/deprecated/prim/pir_prim/test_batch_norm_shape_check.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import pir
+from paddle.decomposition import decompose
+from paddle.framework import core
+
+paddle.enable_static()
+
+
+def batch_norm_net1(x, r_m, r_v, w, b):
+    """Call functional batch_norm on the given tensors with training=False."""
+    return paddle.nn.functional.batch_norm(x, r_m, r_v, w, b, training=False)
+
+
+class TestBuildOp(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [1, 64, 512, 1024]
+        self.c_shape = [64]
+        self.dtype_x = "float32"
+        self.init_x_shape = [1, 64, 512, 1024]
+        self.x = np.random.random(self.x_shape).astype(self.dtype_x)
+        self.r_m = np.random.random(self.x_shape[1]).astype(self.dtype)
+        self.r_v = np.random.random(self.x_shape[1]).astype(self.dtype)
+        self.w = np.random.random(self.x_shape[1]).astype(self.dtype)
+        self.b = np.random.random(self.x_shape[1]).astype(self.dtype)
+        self.net = batch_norm_net1
+        self.necessary_ops = "pd_op.batch_norm"
+        self.enable_cinn = False
+        self.tol = 5e-6
+
+    def get_ir_program(self):
+        """Build a static batch_norm + tanh program and return it as PIR."""
+        paddle.enable_static()
+        x = paddle.randn([4, 4])
+        main_program = paddle.static.Program()
+        start_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, start_program):
+            x = paddle.static.data('x', self.x_shape, x.dtype)
+            x.stop_gradient = False
+            r_m = paddle.static.data('r_m', self.c_shape, x.dtype)
+            r_v = paddle.static.data('r_v', self.c_shape, x.dtype)
+            w = paddle.static.data('w', self.c_shape, x.dtype)
+            b = paddle.static.data('b', self.c_shape, x.dtype)
+            y = batch_norm_net1(x, r_m, r_v, w, b)
+            # tanh must stay the last op: test_build_op reads ops[-2].
+            res = paddle.tanh(y)
+        return pir.translate_to_pir(main_program.desc)
+
+    def test_build_op(self):
+        """Decompose must keep the output shape and drop pd_op.batch_norm_."""
+        pir_program = self.get_ir_program()
+        y = pir_program.global_block().ops[-2].results()
+        orig_shape = y[0].shape
+        with paddle.pir_utils.IrGuard():
+            core._set_prim_forward_enabled(True)
+            try:
+                y_new = decompose(pir_program, y)
+            finally:  # restore the prim-forward flag even if decompose raises
+                core._set_prim_forward_enabled(False)
+            self.assertEqual(orig_shape, y_new[0].shape)
+            op_name_list = [op.name() for op in pir_program.global_block().ops]
+            self.assertNotIn("pd_op.batch_norm_", op_name_list)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/prim/test_comp_get_grad_op_desc_prim_enabled.py b/test/deprecated/prim/test_comp_get_grad_op_desc_prim_enabled_deprecated.py
similarity index 100%
rename from test/deprecated/prim/test_comp_get_grad_op_desc_prim_enabled.py
rename to test/deprecated/prim/test_comp_get_grad_op_desc_prim_enabled_deprecated.py
diff --git a/test/deprecated/quantization/CMakeLists.txt b/test/deprecated/quantization/CMakeLists.txt
index 5fc3911d0417f..0ab38193a8c09 100644
--- a/test/deprecated/quantization/CMakeLists.txt
+++ b/test/deprecated/quantization/CMakeLists.txt
@@ -193,7 +193,7 @@ if(WIN32)
   list(REMOVE_ITEM TEST_OPS test_imperative_qat_amp)
   list(REMOVE_ITEM TEST_OPS test_weight_only_linear)
   list(REMOVE_ITEM TEST_OPS test_llm_int8_linear)
-  list(REMOVE_ITEM TEST_OPS test_quant_aware)
+  list(REMOVE_ITEM TEST_OPS test_quant_aware_deprecated)
   list(REMOVE_ITEM TEST_OPS test_quant_post_quant_aware)
   list(REMOVE_ITEM TEST_OPS test_quant_aware_user_defined)
   list(REMOVE_ITEM TEST_OPS test_quant_amp)
@@ -236,14 +236,14 @@ list(REMOVE_ITEM TEST_OPS test_filter_pruning)
 # fix
 if(WIN32)
   set(SINGLE_CARD_TEST_OPS
-      test_user_defined_quantization
-      test_quantization_scale_pass
-      test_quantization_pass
-      test_moving_average_abs_max_scale_op
+      test_user_defined_quantization_deprecated
+      test_quantization_scale_pass_deprecated
+      test_quantization_pass_deprecated
+      test_moving_average_abs_max_scale_op_deprecated
       test_imperative_qat_channelwise
       test_imperative_qat
       test_imperative_out_scale
-      test_graph)
+      test_graph_deprecated)
   list(REMOVE_ITEM TEST_OPS ${SINGLE_CARD_TEST_OPS})
   foreach(src ${SINGLE_CARD_TEST_OPS})
     py_test(${src} SRCS ${src}.py ENVS CUDA_VISIBLE_DEVICES=0)
@@ -260,16 +260,17 @@ if(NOT WIN32)
                                                                         120)
   set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT
                                                                        120)
-  set_tests_properties(test_quant_aware PROPERTIES TIMEOUT 200)
+  set_tests_properties(test_quant_aware_deprecated PROPERTIES TIMEOUT 200)
   set_tests_properties(test_quant_post_quant_aware PROPERTIES TIMEOUT 200)
   set_tests_properties(test_quant_aware_user_defined PROPERTIES TIMEOUT 200)
   set_tests_properties(test_quant_amp PROPERTIES TIMEOUT 200)
 endif()
 
-set_tests_properties(test_graph PROPERTIES TIMEOUT 120)
-set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120)
+set_tests_properties(test_graph_deprecated PROPERTIES TIMEOUT 120)
+set_tests_properties(test_quantization_pass_deprecated PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200)
-set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200)
+set_tests_properties(test_user_defined_quantization_deprecated
+                     PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_qat_fuse PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200)
@@ -279,3 +280,6 @@ if(APPLE)
                                                                         300)
   set_tests_properties(test_imperative_skip_op PROPERTIES TIMEOUT 300)
 endif()
+
+set_tests_properties(test_quantization_scale_pass_deprecated PROPERTIES TIMEOUT
+                                                                        100)
diff --git a/test/deprecated/quantization/test_graph.py b/test/deprecated/quantization/test_graph_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_graph.py
rename to test/deprecated/quantization/test_graph_deprecated.py
diff --git a/test/deprecated/quantization/test_moving_average_abs_max_scale_op.py b/test/deprecated/quantization/test_moving_average_abs_max_scale_op_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_moving_average_abs_max_scale_op.py
rename to test/deprecated/quantization/test_moving_average_abs_max_scale_op_deprecated.py
diff --git a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass.py b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_quant2_int8_mkldnn_pass.py
rename to test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py
diff --git a/test/deprecated/quantization/test_quant_amp.py b/test/deprecated/quantization/test_quant_amp.py
index 2f285dfdf07d9..b708355a54827 100644
--- a/test/deprecated/quantization/test_quant_amp.py
+++ b/test/deprecated/quantization/test_quant_amp.py
@@ -15,10 +15,12 @@
 
 import logging
 import os
+import sys
 import unittest
 
+sys.path.append(".")
 import numpy as np
-from test_quant_aware import MobileNet
+from test_quant_aware_deprecated import MobileNet
 
 import paddle
 from paddle.static.quantization.quanter import convert, quant_aware
diff --git a/test/deprecated/quantization/test_quant_aware.py b/test/deprecated/quantization/test_quant_aware_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_quant_aware.py
rename to test/deprecated/quantization/test_quant_aware_deprecated.py
diff --git a/test/deprecated/quantization/test_quant_aware_user_defined.py b/test/deprecated/quantization/test_quant_aware_user_defined.py
index 3521ecf7ddeff..124836f560e6a 100644
--- a/test/deprecated/quantization/test_quant_aware_user_defined.py
+++ b/test/deprecated/quantization/test_quant_aware_user_defined.py
@@ -13,10 +13,15 @@
 # limitations under the License.
 import logging
 import os
+import sys
 import unittest
 
+sys.path.append(".")
 import numpy as np
-from test_quant_aware import MobileNet, StaticCase
+from test_quant_aware_deprecated import (
+    MobileNet,
+    StaticCase,
+)
 
 import paddle
 from paddle.static.quantization.quanter import convert, quant_aware
diff --git a/test/deprecated/quantization/test_quant_post_quant_aware.py b/test/deprecated/quantization/test_quant_post_quant_aware.py
index 0fe582306fbd7..db9e0a857f9d9 100644
--- a/test/deprecated/quantization/test_quant_post_quant_aware.py
+++ b/test/deprecated/quantization/test_quant_post_quant_aware.py
@@ -14,10 +14,12 @@
 
 import logging
 import random
+import sys
 import unittest
 
+sys.path.append(".")
 import numpy as np
-from test_quant_aware import StaticCase
+from test_quant_aware_deprecated import StaticCase
 
 import paddle
 from paddle.static.quantization.quanter import convert, quant_aware
diff --git a/test/deprecated/quantization/test_quantization_mkldnn_pass.py b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_quantization_mkldnn_pass.py
rename to test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py
diff --git a/test/deprecated/quantization/test_quantization_pass.py b/test/deprecated/quantization/test_quantization_pass_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_quantization_pass.py
rename to test/deprecated/quantization/test_quantization_pass_deprecated.py
diff --git a/test/deprecated/quantization/test_quantization_scale_pass.py b/test/deprecated/quantization/test_quantization_scale_pass_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_quantization_scale_pass.py
rename to test/deprecated/quantization/test_quantization_scale_pass_deprecated.py
diff --git a/test/deprecated/quantization/test_user_defined_quantization.py b/test/deprecated/quantization/test_user_defined_quantization_deprecated.py
similarity index 100%
rename from test/deprecated/quantization/test_user_defined_quantization.py
rename to test/deprecated/quantization/test_user_defined_quantization_deprecated.py
diff --git a/test/deprecated/rnn/CMakeLists.txt b/test/deprecated/rnn/CMakeLists.txt
index 04773499b3591..a06731560086d 100644
--- a/test/deprecated/rnn/CMakeLists.txt
+++ b/test/deprecated/rnn/CMakeLists.txt
@@ -9,5 +9,7 @@ foreach(TEST_OP ${TEST_OPS})
 endforeach()
 if(NOT WIN32)
   set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_rnn_nets_static_deprecated PROPERTIES TIMEOUT 120)
   set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_rnn_nets_deprecated PROPERTIES TIMEOUT 120)
 endif()
diff --git a/test/deprecated/rnn/test_rnn_nets.py b/test/deprecated/rnn/test_rnn_nets.py
index f87424245ce81..6cec726472c58 100644
--- a/test/deprecated/rnn/test_rnn_nets.py
+++ b/test/deprecated/rnn/test_rnn_nets.py
@@ -95,37 +95,12 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
-    def test_with_input_lengths(self):
-        rnn1 = self.rnn1
-        rnn2 = self.rnn2
-
-        x = np.random.randn(12, 4, 16)
-        if not self.time_major:
-            x = np.transpose(x, [1, 0, 2])
-        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
-
-        y1, h1 = rnn1(x, sequence_length=sequence_length)
-
-        seq_len = paddle.to_tensor(sequence_length)
-        mask = paddle.static.nn.sequence_lod.sequence_mask(
-            seq_len, dtype=paddle.get_default_dtype()
-        )
-        if self.time_major:
-            mask = paddle.transpose(mask, [1, 0])
-        y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        mask = paddle.unsqueeze(mask, -1)
-        y2 = paddle.multiply(y2, mask)
-
-        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
-
     def test_predict(self):
         predict_test_util(self.place, "SimpleRNN")
 
     def runTest(self):
         self.test_with_initial_state()
         self.test_with_zero_state()
-        self.test_with_input_lengths()
         self.test_predict()
 
 
@@ -180,37 +155,12 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
-    def test_with_input_lengths(self):
-        rnn1 = self.rnn1
-        rnn2 = self.rnn2
-
-        x = np.random.randn(12, 4, 16)
-        if not self.time_major:
-            x = np.transpose(x, [1, 0, 2])
-        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
-
-        y1, h1 = rnn1(x, sequence_length=sequence_length)
-
-        seq_len = paddle.to_tensor(sequence_length)
-        mask = paddle.static.nn.sequence_lod.sequence_mask(
-            seq_len, dtype=paddle.get_default_dtype()
-        )
-        if self.time_major:
-            mask = paddle.transpose(mask, [1, 0])
-        y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        mask = paddle.unsqueeze(mask, -1)
-        y2 = paddle.multiply(y2, mask)
-
-        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
-
     def test_predict(self):
         predict_test_util(self.place, "GRU")
 
     def runTest(self):
         self.test_with_initial_state()
         self.test_with_zero_state()
-        self.test_with_input_lengths()
         self.test_predict()
 
 
@@ -273,31 +223,6 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
-    def test_with_input_lengths(self):
-        rnn1 = self.rnn1
-        rnn2 = self.rnn2
-
-        x = np.random.randn(12, 4, 16)
-        if not self.time_major:
-            x = np.transpose(x, [1, 0, 2])
-        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
-
-        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
-
-        seq_len = paddle.to_tensor(sequence_length)
-        mask = paddle.static.nn.sequence_lod.sequence_mask(
-            seq_len, dtype=paddle.get_default_dtype()
-        )
-        if self.time_major:
-            mask = paddle.transpose(mask, [1, 0])
-        y2, (h2, c2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        mask = paddle.unsqueeze(mask, -1)
-        y2 = paddle.multiply(y2, mask)
-
-        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
-
     def test_predict(self):
         predict_test_util(self.place, "LSTM")
         predict_test_util(self.place, "LSTM", False)
@@ -305,7 +230,6 @@ def test_predict(self):
     def runTest(self):
         self.test_with_initial_state()
         self.test_with_zero_state()
-        self.test_with_input_lengths()
         self.test_predict()
 
 
diff --git a/test/deprecated/rnn/test_rnn_nets_deprecated.py b/test/deprecated/rnn/test_rnn_nets_deprecated.py
new file mode 100644
index 0000000000000..ee435a1235ef7
--- /dev/null
+++ b/test/deprecated/rnn/test_rnn_nets_deprecated.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+paddle.set_default_dtype("float64")
+import os
+import sys
+import tempfile
+import unittest
+
+import numpy as np
+from convert import convert_params_for_net
+
+from paddle.pir_utils import test_with_dygraph_pir
+
+sys.path.append("../../rnn")
+from rnn_numpy import GRU, LSTM, SimpleRNN
+
+bidirectional_list = ["bidirectional", "bidirect"]
+
+
+class TestSimpleRNN(unittest.TestCase):
+    """Compare `paddle.nn.SimpleRNN` with the `rnn_numpy` SimpleRNN.
+
+    A case is parameterized by layout (`time_major`), `direction`, device
+    (`place`) and nonlinearity (`mode`).
+    """
+
+    def __init__(
+        self, time_major=True, direction="forward", place="cpu", mode='RNN_TANH'
+    ):
+        # Every instance runs `runTest`; `load_tests` builds them explicitly.
+        super().__init__("runTest")
+        self.time_major, self.direction = time_major, direction
+        self.place, self.mode = place, mode
+        self.num_directions = 2 if direction in bidirectional_list else 1
+
+    def setUp(self):
+        """Select the device, then build and pair the two networks."""
+        # `set_device` is global, so it is applied here and not in
+        # `__init__`; otherwise a device set by another case could be used.
+        place = paddle.set_device(self.place)
+        paddle.disable_static(place)
+        shared = {"time_major": self.time_major, "direction": self.direction}
+        self.rnn1 = SimpleRNN(16, 32, 2, nonlinearity=self.mode, **shared)
+        # "RNN_TANH"/"RNN_RELU" become "tanh"/"relu" for the paddle layer.
+        self.rnn2 = paddle.nn.SimpleRNN(
+            16, 32, 2, activation=self.mode[4:].lower(), **shared
+        )
+        convert_params_for_net(self.rnn1, self.rnn2)
+
+    def test_with_input_lengths(self):
+        """Both networks must agree when `sequence_length` is given.
+
+        Both returned values (`y`, `h`) are compared with `assert_allclose`.
+        """
+        ref_net, pd_net = self.rnn1, self.rnn2
+
+        # Generated as (12, 4, 16); axes 0 and 1 are swapped if not time-major.
+        inputs = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            inputs = np.transpose(inputs, [1, 0, 2])
+        lengths = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = ref_net(inputs, sequence_length=lengths)
+
+        seq_len = paddle.to_tensor(lengths)
+        mask = paddle.static.nn.sequence_lod.sequence_mask(
+            seq_len, dtype=paddle.get_default_dtype()
+        )
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        y2, h2 = pd_net(paddle.to_tensor(inputs), sequence_length=seq_len)
+        # Mask the paddle output before comparing it with the reference.
+        y2 = paddle.multiply(y2, paddle.unsqueeze(mask, -1))
+
+        for expected, actual in ((y1, y2), (h1, h2)):
+            np.testing.assert_allclose(
+                expected, actual.numpy(), atol=1e-8, rtol=1e-5
+            )
+
+    def runTest(self):
+        """Entry point named in `__init__`; runs the single check."""
+        self.test_with_input_lengths()
+
+
+class TestGRU(unittest.TestCase):
+    """Compare `paddle.nn.GRU` with the `rnn_numpy` GRU."""
+
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super().__init__("runTest")
+        self.time_major, self.direction = time_major, direction
+        self.place = place
+        self.num_directions = 2 if direction in bidirectional_list else 1
+
+    def setUp(self):
+        """Select the device, then build and pair the two networks."""
+        # `set_device` is global, so it is applied here and not in
+        # `__init__`; otherwise a device set by another case could be used.
+        place = paddle.set_device(self.place)
+        paddle.disable_static(place)
+        shared = {"time_major": self.time_major, "direction": self.direction}
+        self.rnn1 = GRU(16, 32, 2, **shared)
+        self.rnn2 = paddle.nn.GRU(16, 32, 2, **shared)
+        convert_params_for_net(self.rnn1, self.rnn2)
+
+    def test_with_input_lengths(self):
+        """Both networks must agree when `sequence_length` is given."""
+        ref_net, pd_net = self.rnn1, self.rnn2
+
+        # Generated as (12, 4, 16); axes 0 and 1 are swapped if not time-major.
+        inputs = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            inputs = np.transpose(inputs, [1, 0, 2])
+        lengths = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = ref_net(inputs, sequence_length=lengths)
+
+        seq_len = paddle.to_tensor(lengths)
+        mask = paddle.static.nn.sequence_lod.sequence_mask(
+            seq_len, dtype=paddle.get_default_dtype()
+        )
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        y2, h2 = pd_net(paddle.to_tensor(inputs), sequence_length=seq_len)
+        # Mask the paddle output before comparing it with the reference.
+        y2 = paddle.multiply(y2, paddle.unsqueeze(mask, -1))
+
+        for expected, actual in ((y1, y2), (h1, h2)):
+            np.testing.assert_allclose(
+                expected, actual.numpy(), atol=1e-8, rtol=1e-5
+            )
+
+    def runTest(self):
+        """Entry point named in `__init__`; runs the single check."""
+        self.test_with_input_lengths()
+
+
+class TestLSTM(unittest.TestCase):
+    """Compare `paddle.nn.LSTM` with the `rnn_numpy` LSTM."""
+
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        # Every instance runs `runTest`; `load_tests` builds them explicitly.
+        super().__init__("runTest")
+        self.time_major, self.direction = time_major, direction
+        self.place = place
+        self.num_directions = 2 if direction in bidirectional_list else 1
+
+    def setUp(self):
+        """Select the device, then build and pair the two networks."""
+        # `set_device` is global, so it is applied here and not in
+        # `__init__`; otherwise a device set by another case could be used.
+        place = paddle.set_device(self.place)
+        paddle.disable_static(place)
+        shared = {"time_major": self.time_major, "direction": self.direction}
+        self.rnn1 = LSTM(16, 32, 2, **shared)
+        self.rnn2 = paddle.nn.LSTM(16, 32, 2, **shared)
+        convert_params_for_net(self.rnn1, self.rnn2)
+
+    def test_with_input_lengths(self):
+        """Both networks must agree when `sequence_length` is given."""
+        ref_net, pd_net = self.rnn1, self.rnn2
+
+        # Generated as (12, 4, 16); axes 0 and 1 are swapped if not time-major.
+        inputs = np.random.randn(12, 4, 16)
+        if not self.time_major:
+            inputs = np.transpose(inputs, [1, 0, 2])
+        lengths = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, (h1, c1) = ref_net(inputs, sequence_length=lengths)
+
+        seq_len = paddle.to_tensor(lengths)
+        mask = paddle.static.nn.sequence_lod.sequence_mask(
+            seq_len, dtype=paddle.get_default_dtype()
+        )
+        if self.time_major:
+            mask = paddle.transpose(mask, [1, 0])
+        y2, (h2, c2) = pd_net(paddle.to_tensor(inputs), sequence_length=seq_len)
+        # Mask the paddle output before comparing it with the reference.
+        y2 = paddle.multiply(y2, paddle.unsqueeze(mask, -1))
+
+        for expected, actual in ((y1, y2), (h1, h2), (c1, c2)):
+            np.testing.assert_allclose(
+                expected, actual.numpy(), atol=1e-8, rtol=1e-5
+            )
+
+    def runTest(self):
+        """Entry point named in `__init__`; runs the single check."""
+        self.test_with_input_lengths()
+
+
+class TestLSTMWithProjSize(TestLSTM):
+    """`TestLSTM` variant whose networks are built with `proj_size=8`.
+
+    Only `setUp` is overridden; `__init__`, `test_with_input_lengths` and
+    `runTest` are inherited from `TestLSTM`.
+    """
+
+    def setUp(self):
+        """Select the device, then build and pair the projected LSTMs."""
+        # `set_device` is global, so it is applied here and not in
+        # `__init__`; otherwise a device set by another case could be used.
+        place = paddle.set_device(self.place)
+        paddle.disable_static(place)
+
+        # Both networks receive the same construction arguments.
+        rnn_kwargs = dict(
+            time_major=self.time_major,
+            direction=self.direction,
+            proj_size=8,
+        )
+        ref_net = LSTM(16, 32, 2, **rnn_kwargs)
+        pd_net = paddle.nn.LSTM(16, 32, 2, **rnn_kwargs)
+        convert_params_for_net(ref_net, pd_net)
+
+        # Publish the pair under the attribute names the inherited test reads.
+        self.rnn1, self.rnn2 = ref_net, pd_net
+        self.proj_size = 8
+
+
+@test_with_dygraph_pir
+def predict_test_util(place, mode, stop_gradient=True):
+    """Check that a saved `paddle.nn.<mode>` model matches its eval output.
+
+    Runs one Adam step in dygraph, saves the net through `jit.to_static` /
+    `jit.save`, reloads it in static mode and compares both results.
+    """
+    place = paddle.set_device(place)
+    paddle.seed(123)
+    np.random.seed(123)
+
+    class Net(paddle.nn.Layer):
+        def __init__(self):
+            super().__init__()
+            self.rnn = getattr(paddle.nn, mode)(
+                16, 32, 2, direction="bidirectional", dropout=0.1
+            )
+
+        def forward(self, input):
+            return self.rnn(input)
+
+    x = paddle.randn((4, 10, 16))
+    x.stop_gradient = stop_gradient
+    seq_len = paddle.to_tensor(np.array([10, 6, 8, 5]))
+    mask = paddle.static.nn.sequence_lod.sequence_mask(
+        seq_len, maxlen=10, dtype=x.dtype
+    )
+    mask = paddle.unsqueeze(mask, [2])
+    rnn = Net()
+    y, _ = rnn(x)
+    y = y * mask
+    loss = paddle.mean(y)
+    loss.backward()
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=0.1, parameters=rnn.parameters()
+    )
+    optimizer.step()
+    rnn.eval()
+    y, _ = rnn(x)
+    # `jit.to_static` would include a train_program, eval mode might cause
+    # some errors currently, such as dropout grad op gets `is_test == True`.
+    rnn.train()
+
+    rnn = paddle.jit.to_static(
+        rnn,
+        [paddle.static.InputSpec(shape=[None, None, 16], dtype=x.dtype)],
+        full_graph=True,
+    )
+    # The `with`/`finally` pair cleans up even when the comparison fails.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        save_dirname = os.path.join(temp_dir, f"./inference/{mode}_infer")
+        paddle.jit.save(rnn, save_dirname)
+        paddle.enable_static()
+        try:
+            with paddle.static.scope_guard(paddle.static.Scope()):
+                exe = paddle.static.Executor(place)
+                (
+                    inference_program,
+                    feed_target_names,
+                    fetch_targets,
+                ) = paddle.static.load_inference_model(save_dirname, exe)
+                results = exe.run(
+                    inference_program,
+                    feed={feed_target_names[0]: x.numpy()},
+                    fetch_list=fetch_targets,
+                )
+                np.testing.assert_equal(y.numpy(), results[0])
+        finally:
+            paddle.disable_static()
+
+
+def load_tests(loader, tests, pattern):
+    """unittest `load_tests` hook: build the parameterized suite by hand.
+
+    Each test class is instantiated for every direction, layout and
+    available device; TestSimpleRNN additionally runs with "RNN_RELU".
+    """
+    devices = ["cpu"]
+    if paddle.base.is_compiled_with_cuda():
+        devices.append("gpu")
+    case_classes = (TestSimpleRNN, TestLSTM, TestGRU, TestLSTMWithProjSize)
+    suite = unittest.TestSuite()
+    for direction in ("forward", "bidirectional", "bidirect"):
+        for time_major in (True, False):
+            for device in devices:
+                for case_cls in case_classes:
+                    args = (time_major, direction, device)
+                    suite.addTest(case_cls(*args))
+                    if case_cls is TestSimpleRNN:
+                        suite.addTest(case_cls(*args, mode="RNN_RELU"))
+    return suite
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/rnn/test_rnn_nets_static.py b/test/deprecated/rnn/test_rnn_nets_static.py
index da00c37682fae..3ccdad1dfc71e 100644
--- a/test/deprecated/rnn/test_rnn_nets_static.py
+++ b/test/deprecated/rnn/test_rnn_nets_static.py
@@ -150,50 +150,9 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
 
-    def test_with_input_lengths(self):
-        mp = self.mp.clone()
-        sp = self.sp
-        rnn1 = self.rnn1
-        rnn2 = self.rnn2
-        exe = self.executor
-        scope = self.scope
-
-        x = np.random.randn(12, 4, 16)
-        if not self.time_major:
-            x = np.transpose(x, [1, 0, 2])
-        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
-
-        y1, h1 = rnn1(x, sequence_length=sequence_length)
-
-        with paddle.base.unique_name.guard():
-            with paddle.static.program_guard(mp, sp):
-                x_data = paddle.static.data(
-                    "input",
-                    [-1, -1, 16],
-                    dtype=paddle.framework.get_default_dtype(),
-                )
-                seq_len = paddle.static.data("seq_len", [-1], dtype="int64")
-                mask = paddle.static.nn.sequence_lod.sequence_mask(
-                    seq_len, dtype=paddle.get_default_dtype()
-                )
-                if self.time_major:
-                    mask = paddle.transpose(mask, [1, 0])
-                y, h = rnn2(x_data, sequence_length=seq_len)
-                mask = paddle.unsqueeze(mask, -1)
-                y = paddle.multiply(y, mask)
-
-        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
-
-        with paddle.static.scope_guard(scope):
-            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
-
-        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
-
     def runTest(self):
         self.test_with_initial_state()
         self.test_with_zero_state()
-        self.test_with_input_lengths()
 
 
 class TestGRU(unittest.TestCase):
@@ -307,46 +266,6 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
 
-    def test_with_input_lengths(self):
-        mp = self.mp.clone()
-        sp = self.sp
-        rnn1 = self.rnn1
-        rnn2 = self.rnn2
-        exe = self.executor
-        scope = self.scope
-
-        x = np.random.randn(12, 4, 16)
-        if not self.time_major:
-            x = np.transpose(x, [1, 0, 2])
-        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
-
-        y1, h1 = rnn1(x, sequence_length=sequence_length)
-
-        with paddle.base.unique_name.guard():
-            with paddle.static.program_guard(mp, sp):
-                x_data = paddle.static.data(
-                    "input",
-                    [-1, -1, 16],
-                    dtype=paddle.framework.get_default_dtype(),
-                )
-                seq_len = paddle.static.data("seq_len", [-1], dtype="int64")
-                mask = paddle.static.nn.sequence_lod.sequence_mask(
-                    seq_len, dtype=paddle.get_default_dtype()
-                )
-                if self.time_major:
-                    mask = paddle.transpose(mask, [1, 0])
-                y, h = rnn2(x_data, sequence_length=seq_len)
-                mask = paddle.unsqueeze(mask, -1)
-                y = paddle.multiply(y, mask)
-
-        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
-
-        with paddle.static.scope_guard(scope):
-            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
-
-        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
-
     def runTest(self):
         self.test_with_initial_state()
         self.test_with_zero_state()
@@ -476,51 +395,9 @@ def test_with_zero_state(self):
         np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
 
-    def test_with_input_lengths(self):
-        mp = self.mp.clone()
-        sp = self.sp
-        rnn1 = self.rnn1
-        rnn2 = self.rnn2
-        exe = self.executor
-        scope = self.scope
-
-        x = np.random.randn(12, 4, 16)
-        if not self.time_major:
-            x = np.transpose(x, [1, 0, 2])
-        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
-
-        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
-
-        with paddle.base.unique_name.guard():
-            with paddle.static.program_guard(mp, sp):
-                x_data = paddle.static.data(
-                    "input",
-                    [-1, -1, 16],
-                    dtype=paddle.framework.get_default_dtype(),
-                )
-                seq_len = paddle.static.data("seq_len", [-1], dtype="int64")
-                mask = paddle.static.nn.sequence_lod.sequence_mask(
-                    seq_len, dtype=paddle.get_default_dtype()
-                )
-                if self.time_major:
-                    mask = paddle.transpose(mask, [1, 0])
-                y, (h, c) = rnn2(x_data, sequence_length=seq_len)
-                mask = paddle.unsqueeze(mask, -1)
-                y = paddle.multiply(y, mask)
-
-        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
-
-        with paddle.static.scope_guard(scope):
-            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
-
-        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
-        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
-
     def runTest(self):
         self.test_with_initial_state()
         self.test_with_zero_state()
-        self.test_with_input_lengths()
 
 
 class TestLSTMWithProjSize(TestLSTM):
diff --git a/test/deprecated/rnn/test_rnn_nets_static_deprecated.py b/test/deprecated/rnn/test_rnn_nets_static_deprecated.py
new file mode 100644
index 0000000000000..ef58211d65d66
--- /dev/null
+++ b/test/deprecated/rnn/test_rnn_nets_static_deprecated.py
@@ -0,0 +1,372 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+paddle.set_default_dtype("float64")
+
+
+paddle.enable_static()
+
+import sys
+import unittest
+
+import numpy as np
+from convert import convert_params_for_net_static
+
+sys.path.append("../../rnn")
+from rnn_numpy import GRU, LSTM, SimpleRNN
+
+bidirectional_list = ["bidirectional", "bidirect"]
+
+
+class TestSimpleRNN(unittest.TestCase):  # paddle.nn.SimpleRNN vs. rnn_numpy
+    def __init__(
+        self, time_major=True, direction="forward", place="cpu", mode="RNN_TANH"
+    ):
+        super().__init__("runTest")  # unittest will invoke `runTest`
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction in bidirectional_list else 1
+        self.place = place
+        self.mode = mode
+
+    def setUp(self):
+        # `set_device` is global, so call it in `setUp` rather than `__init__`;
+        # otherwise a device chosen by another test case could be used.
+        place = paddle.set_device(self.place)
+        rnn1 = SimpleRNN(  # reference implementation imported from rnn_numpy
+            16,
+            32,
+            2,
+            time_major=self.time_major,
+            direction=self.direction,
+            nonlinearity=self.mode,
+        )
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.SimpleRNN(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction,
+                    activation=self.mode[4:].lower(),  # "RNN_TANH" -> "tanh"
+                )
+
+        exe = paddle.static.Executor(place)
+        scope = paddle.base.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # `sp` is the startup program passed to program_guard
+            # presumably copies rnn1's weights into rnn2 -- see convert.py
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place  # replaces the device string with set_device's result
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_input_lengths(self):
+        mp = self.mp.clone()  # ops built below go into the clone, not self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)  # time-major: 12 steps, batch 4, 16 features
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.static.data(
+                    "input",
+                    [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype(),
+                )
+                seq_len = paddle.static.data("seq_len", [-1], dtype="int64")
+                mask = paddle.static.nn.sequence_lod.sequence_mask(
+                    seq_len, dtype=paddle.get_default_dtype()
+                )
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, h = rnn2(x_data, sequence_length=seq_len)
+                mask = paddle.unsqueeze(mask, -1)
+                y = paddle.multiply(y, mask)  # mask outputs past each length
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__
+        self.test_with_input_lengths()
+
+
+class TestGRU(unittest.TestCase):  # paddle.nn.GRU vs. rnn_numpy.GRU
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super().__init__("runTest")  # unittest will invoke `runTest`
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction in bidirectional_list else 1
+        self.place = place
+
+    def setUp(self):
+        # `set_device` is global, so call it in `setUp` rather than `__init__`;
+        # otherwise a device chosen by another test case could be used.
+        place = paddle.set_device(self.place)
+        rnn1 = GRU(  # reference implementation imported from rnn_numpy
+            16, 32, 2, time_major=self.time_major, direction=self.direction
+        )
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.GRU(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction,
+                )
+
+        exe = paddle.static.Executor(place)
+        scope = paddle.base.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # `sp` is the startup program passed to program_guard
+            # presumably copies rnn1's weights into rnn2 -- see convert.py
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place  # replaces the device string with set_device's result
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_input_lengths(self):
+        mp = self.mp.clone()  # ops built below go into the clone, not self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)  # time-major: 12 steps, batch 4, 16 features
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, h1 = rnn1(x, sequence_length=sequence_length)
+
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.static.data(
+                    "input",
+                    [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype(),
+                )
+                seq_len = paddle.static.data("seq_len", [-1], dtype="int64")
+                mask = paddle.static.nn.sequence_lod.sequence_mask(
+                    seq_len, dtype=paddle.get_default_dtype()
+                )
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, h = rnn2(x_data, sequence_length=seq_len)
+                mask = paddle.unsqueeze(mask, -1)
+                y = paddle.multiply(y, mask)  # mask outputs past each length
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__
+        self.test_with_input_lengths()
+
+
+class TestLSTM(unittest.TestCase):  # paddle.nn.LSTM vs. rnn_numpy.LSTM
+    def __init__(self, time_major=True, direction="forward", place="cpu"):
+        super().__init__("runTest")  # unittest will invoke `runTest`
+        self.time_major = time_major
+        self.direction = direction
+        self.num_directions = 2 if direction in bidirectional_list else 1
+        self.place = place
+
+    def setUp(self):
+        # `set_device` is global, so call it in `setUp` rather than `__init__`;
+        # otherwise a device chosen by another test case could be used.
+        place = paddle.set_device(self.place)
+        rnn1 = LSTM(  # reference implementation imported from rnn_numpy
+            16, 32, 2, time_major=self.time_major, direction=self.direction
+        )
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.LSTM(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction,
+                )
+
+        exe = paddle.static.Executor(place)
+        scope = paddle.base.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # `sp` is the startup program passed to program_guard
+            # presumably copies rnn1's weights into rnn2 -- see convert.py
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+
+        self.place = place  # replaces the device string with set_device's result
+        self.executor = exe
+        self.scope = scope
+
+    def test_with_input_lengths(self):
+        mp = self.mp.clone()  # ops built below go into the clone, not self.mp
+        sp = self.sp
+        rnn1 = self.rnn1
+        rnn2 = self.rnn2
+        exe = self.executor
+        scope = self.scope
+
+        x = np.random.randn(12, 4, 16)  # time-major: 12 steps, batch 4, 16 features
+        if not self.time_major:
+            x = np.transpose(x, [1, 0, 2])
+        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)
+
+        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
+
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                x_data = paddle.static.data(
+                    "input",
+                    [-1, -1, 16],
+                    dtype=paddle.framework.get_default_dtype(),
+                )
+                seq_len = paddle.static.data("seq_len", [-1], dtype="int64")
+                mask = paddle.static.nn.sequence_lod.sequence_mask(
+                    seq_len, dtype=paddle.get_default_dtype()
+                )
+                if self.time_major:
+                    mask = paddle.transpose(mask, [1, 0])
+                y, (h, c) = rnn2(x_data, sequence_length=seq_len)
+                mask = paddle.unsqueeze(mask, -1)
+                y = paddle.multiply(y, mask)  # mask outputs past each length
+
+        feed_dict = {x_data.name: x, seq_len.name: sequence_length}
+
+        with paddle.static.scope_guard(scope):
+            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])
+
+        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
+        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
+
+    def runTest(self):  # entry point named in __init__
+        self.test_with_input_lengths()
+
+
+class TestLSTMWithProjSize(TestLSTM):  # TestLSTM checks with proj_size=8
+    def setUp(self):
+        # `set_device` is global, so call it in `setUp` rather than `__init__`;
+        # otherwise a device chosen by another test case could be used.
+        place = paddle.set_device(self.place)
+        rnn1 = LSTM(  # reference implementation imported from rnn_numpy
+            16,
+            32,
+            2,
+            time_major=self.time_major,
+            direction=self.direction,
+            proj_size=8,
+        )
+
+        mp = paddle.static.Program()
+        sp = paddle.static.Program()
+        with paddle.base.unique_name.guard():
+            with paddle.static.program_guard(mp, sp):
+                rnn2 = paddle.nn.LSTM(
+                    16,
+                    32,
+                    2,
+                    time_major=self.time_major,
+                    direction=self.direction,
+                    proj_size=8,
+                )
+
+        exe = paddle.static.Executor(place)
+        scope = paddle.base.Scope()
+        with paddle.static.scope_guard(scope):
+            exe.run(sp)  # `sp` is the startup program passed to program_guard
+            # presumably copies rnn1's weights into rnn2 -- see convert.py
+            convert_params_for_net_static(rnn1, rnn2, place)
+
+        self.mp = mp
+        self.sp = sp
+        self.rnn1 = rnn1
+        self.rnn2 = rnn2
+        self.proj_size = 8  # same value passed to both LSTM constructors above
+
+        self.place = place  # replaces the device string with set_device's result
+        self.executor = exe
+        self.scope = scope
+
+
+def load_tests(loader, tests, pattern):  # unittest `load_tests` protocol hook
+    suite = unittest.TestSuite()
+    devices = ["cpu", "gpu"] if paddle.base.is_compiled_with_cuda() else ["cpu"]
+    for direction in ["forward", "bidirectional", "bidirect"]:
+        for time_major in [True, False]:
+            for device in devices:
+                for test_class in [
+                    TestSimpleRNN,
+                    TestLSTM,
+                    TestGRU,
+                    TestLSTMWithProjSize,
+                ]:
+                    suite.addTest(test_class(time_major, direction, device))
+                    if test_class == TestSimpleRNN:  # also cover the ReLU mode
+                        suite.addTest(
+                            test_class(
+                                time_major, direction, device, mode="RNN_RELU"
+                            )
+                        )
+    return suite  # `loader`, `tests` and `pattern` are intentionally unused
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/standalone_executor/test_standalone_dist_attr_run_time_set_get.py b/test/deprecated/standalone_executor/test_standalone_dist_attr_run_time_set_get_deprecated.py
similarity index 100%
rename from test/deprecated/standalone_executor/test_standalone_dist_attr_run_time_set_get.py
rename to test/deprecated/standalone_executor/test_standalone_dist_attr_run_time_set_get_deprecated.py
diff --git a/test/deprecated/standalone_executor/test_standalone_executor_multi_micro_batch.py b/test/deprecated/standalone_executor/test_standalone_executor_multi_micro_batch_deprecated.py
similarity index 100%
rename from test/deprecated/standalone_executor/test_standalone_executor_multi_micro_batch.py
rename to test/deprecated/standalone_executor/test_standalone_executor_multi_micro_batch_deprecated.py
diff --git a/test/deprecated/standalone_executor/test_standalone_executor_plan.py b/test/deprecated/standalone_executor/test_standalone_executor_plan_deprecated.py
similarity index 100%
rename from test/deprecated/standalone_executor/test_standalone_executor_plan.py
rename to test/deprecated/standalone_executor/test_standalone_executor_plan_deprecated.py
diff --git a/test/deprecated/standalone_executor/test_standalone_op_priority.py b/test/deprecated/standalone_executor/test_standalone_op_priority_deprecated.py
similarity index 100%
rename from test/deprecated/standalone_executor/test_standalone_op_priority.py
rename to test/deprecated/standalone_executor/test_standalone_op_priority_deprecated.py
diff --git a/test/deprecated/standalone_executor/test_standalone_sequentail_run.py b/test/deprecated/standalone_executor/test_standalone_sequentail_run_deprecated.py
similarity index 100%
rename from test/deprecated/standalone_executor/test_standalone_sequentail_run.py
rename to test/deprecated/standalone_executor/test_standalone_sequentail_run_deprecated.py
diff --git a/test/deprecated/tokenizer/CMakeLists.txt b/test/deprecated/tokenizer/CMakeLists.txt
index 1cf384df660b3..cbab1a270c28f 100644
--- a/test/deprecated/tokenizer/CMakeLists.txt
+++ b/test/deprecated/tokenizer/CMakeLists.txt
@@ -8,5 +8,6 @@ foreach(src ${TEST_OPS})
   py_test(${src} SRCS ${src}.py)
 endforeach()
 
-set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS
-                                                         "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(
+  test_faster_tokenizer_op_deprecated PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE"
+                                                 TIMEOUT 120)
diff --git a/test/deprecated/tokenizer/test_faster_tokenizer_op.py b/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py
similarity index 100%
rename from test/deprecated/tokenizer/test_faster_tokenizer_op.py
rename to test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py
diff --git a/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py b/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py
index b36a5121d2e82..5127589c36396 100644
--- a/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py
+++ b/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py
@@ -22,7 +22,7 @@
     (%38) = "pd_op.data" () {dtype:(pd_op.DataType)bfloat16,name:"linear_0.tmp_0",persistable:[false],place:(pd_op.Place)Place(gpu:0),shape:(pd_op.IntArray)[4096,1,28672],stop_gradient:[false]} : () -> builtin.tensor<4096x1x28672xbf16>
     (%48) = "pd_op.data" () {dtype:(pd_op.DataType)bfloat16,name:"input",persistable:[false],place:(pd_op.Place)Place(gpu:0),shape:(pd_op.IntArray)[4096,1,28672],stop_gradient:[false]} : () -> builtin.tensor<4096x1x28672xbf16>
     (%50) = "pd_op.matmul" (%48, %2) {persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:true} : (builtin.tensor<4096x1x28672xbf16>, builtin.tensor<8192x28672xbf16>) -> builtin.tensor<4096x1x8192xbf16>
-    (%57) = "pd_op.c_allreduce_sum_" (%50) {persistable:[false],ring_id:(Int32)36,stop_gradient:[false],use_calc_stream:true,use_model_parallel:true} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16>
+    (%57) = "pd_op.c_allreduce_sum_" (%50) {event_to_record:"event_7989",events_to_wait:[],execution_stream:"auto_parallel_mp",force_record_event:false,persistable:[false],ring_id:(Int32)36,stop_gradient:[false],use_calc_stream:true,use_model_parallel:true} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16>
     (%63) = "pd_op.assign" (%57) {persistable:[false],stop_gradient:[false]} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16>
     (%64) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xi32>
     (%65) = "pd_op.split_with_num" (%63, %64) {num:(Int32)2,persistable:[false],stop_gradient:[false]} : (builtin.tensor<4096x1x8192xbf16>, builtin.tensor<1xi32>) -> vec[builtin.tensor<2048x1x8192xbf16>,builtin.tensor<2048x1x8192xbf16>]
diff --git a/test/deprecated/distributed_passes/test_ps_trainer_pass.py b/test/distributed_passes/test_ps_trainer_pass.py
similarity index 100%
rename from test/deprecated/distributed_passes/test_ps_trainer_pass.py
rename to test/distributed_passes/test_ps_trainer_pass.py
diff --git a/test/distribution/test_distribution_student_t.py b/test/distribution/test_distribution_student_t.py
new file mode 100644
index 0000000000000..900e47cea2428
--- /dev/null
+++ b/test/distribution/test_distribution_student_t.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import parameterize
+import scipy.stats
+from distribution import config
+from parameterize import (
+    TEST_CASE_NAME,
+    parameterize_cls,
+    parameterize_func,
+)
+
+import paddle
+from paddle.distribution.student_t import StudentT
+
+
+@parameterize.place(config.DEVICES)
+@parameterize.parameterize_cls(
+    (parameterize.TEST_CASE_NAME, 'df', 'loc', 'scale'),
+    [
+        (
+            'one-dim',
+            10.0,
+            1.0,
+            2.0,
+        ),
+        (
+            'multi-dim',
+            parameterize.xrand((2, 1), dtype='float32', min=4, max=30),
+            parameterize.xrand((2, 3), dtype='float32', min=1, max=10),
+            parameterize.xrand((2, 3), dtype='float32', min=0.1, max=3),
+        ),
+        (
+            'multi-dim2',
+            parameterize.xrand((2, 1), dtype='float64', min=4, max=30),
+            parameterize.xrand((2, 3), dtype='float64', min=-10, max=-1),
+            parameterize.xrand((2, 3), dtype='float64', min=0.1, max=3),
+        ),
+    ],
+)
+class TestStudentT(unittest.TestCase):  # moments/entropy/sampling vs. SciPy
+    def setUp(self):
+        df = (  # Python floats are passed through; other values become tensors
+            self.df if isinstance(self.df, float) else paddle.to_tensor(self.df)
+        )
+        loc = (
+            self.loc
+            if isinstance(self.loc, float)
+            else paddle.to_tensor(self.loc)
+        )
+        scale = (
+            self.scale
+            if isinstance(self.scale, float)
+            else paddle.to_tensor(self.scale)
+        )
+        self._dist = StudentT(df, loc, scale)
+
+    def test_mean(self):
+        mean = self._dist.mean
+        target_dtype = (  # float parameters are expected to give float32
+            "float32" if isinstance(self.df, float) else self.df.dtype
+        )
+        self.assertEqual(mean.numpy().dtype, target_dtype)
+        np.testing.assert_allclose(
+            mean,
+            self._np_mean(),
+            rtol=config.RTOL.get(str(target_dtype)),
+            atol=config.ATOL.get(str(target_dtype)),
+        )
+
+    def test_variance(self):
+        var = self._dist.variance
+        target_dtype = (  # float parameters are expected to give float32
+            "float32" if isinstance(self.df, float) else self.df.dtype
+        )
+        self.assertEqual(var.numpy().dtype, target_dtype)
+        np.testing.assert_allclose(
+            var,
+            self._np_variance(),
+            rtol=config.RTOL.get(str(target_dtype)),
+            atol=config.ATOL.get(str(target_dtype)),
+        )
+
+    def test_entropy(self):
+        entropy = self._dist.entropy()
+        target_dtype = (  # float parameters are expected to give float32
+            "float32" if isinstance(self.df, float) else self.df.dtype
+        )
+        self.assertEqual(entropy.numpy().dtype, target_dtype)
+        np.testing.assert_allclose(
+            entropy,
+            self._np_entropy(),
+            rtol=config.RTOL.get(str(target_dtype)),
+            atol=config.ATOL.get(str(target_dtype)),
+        )
+
+    def test_sample(self):
+        sample_shape = ()  # a single draw: result shape is batch + event shape
+        samples = self._dist.sample(sample_shape)
+        self.assertEqual(
+            tuple(samples.shape),
+            sample_shape + self._dist.batch_shape + self._dist.event_shape,
+        )
+
+        sample_shape = (10000,)  # many draws for the empirical moment checks
+        samples = self._dist.sample(sample_shape)
+        sample_mean = samples.mean(axis=0)
+        sample_variance = samples.var(axis=0)
+
+        # The relative tolerance of 0.1 is an empirical value, consistent with
+        # TensorFlow.
+        np.testing.assert_allclose(
+            sample_mean, self._dist.mean, atol=0, rtol=0.10
+        )
+        # The relative tolerance of 0.1 is an empirical value, consistent with
+        # TensorFlow.
+        np.testing.assert_allclose(
+            sample_variance, self._dist.variance, atol=0, rtol=0.10
+        )
+
+    def _np_variance(self):  # SciPy reference; float32 arrays widened to float64
+        if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32:
+            df = self.df.astype("float64")
+        else:
+            df = self.df
+        if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32:
+            loc = self.loc.astype("float64")
+        else:
+            loc = self.loc
+        if (
+            isinstance(self.scale, np.ndarray)
+            and self.scale.dtype == np.float32
+        ):
+            scale = self.scale.astype("float64")
+        else:
+            scale = self.scale
+        return scipy.stats.t.var(df, loc, scale)
+
+    def _np_mean(self):  # SciPy reference; float32 arrays widened to float64
+        if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32:
+            df = self.df.astype("float64")
+        else:
+            df = self.df
+        if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32:
+            loc = self.loc.astype("float64")
+        else:
+            loc = self.loc
+        if (
+            isinstance(self.scale, np.ndarray)
+            and self.scale.dtype == np.float32
+        ):
+            scale = self.scale.astype("float64")
+        else:
+            scale = self.scale
+        return scipy.stats.t.mean(df, loc, scale)
+
+    def _np_entropy(self):  # SciPy reference; float32 arrays widened to float64
+        if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32:
+            df = self.df.astype("float64")
+        else:
+            df = self.df
+        if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32:
+            loc = self.loc.astype("float64")
+        else:
+            loc = self.loc
+        if (
+            isinstance(self.scale, np.ndarray)
+            and self.scale.dtype == np.float32
+        ):
+            scale = self.scale.astype("float64")
+        else:
+            scale = self.scale
+        return scipy.stats.t.entropy(df, loc, scale)
+
+
+@parameterize.place(config.DEVICES)
+@parameterize.parameterize_cls(
+    (parameterize.TEST_CASE_NAME, 'df', 'loc', 'scale', 'value'),
+    [
+        (
+            'one-dim',
+            10.0,
+            0.0,
+            1.0,
+            np.array(3.3).astype("float32"),
+        ),
+        (
+            'value-broadcast-shape',
+            parameterize.xrand((2, 1), dtype='float64', min=4, max=30),
+            parameterize.xrand((2, 1), dtype='float64', min=-10, max=10),
+            parameterize.xrand((2, 1), dtype='float64', min=0.1, max=5),
+            parameterize.xrand((2, 4), dtype='float64', min=-10, max=10),
+        ),
+    ],
+)
+class TestStudentTProbs(unittest.TestCase):  # prob/log_prob vs. scipy.stats.t
+    def setUp(self):
+        df = (  # Python floats are passed through; other values become tensors
+            self.df if isinstance(self.df, float) else paddle.to_tensor(self.df)
+        )
+        loc = (
+            self.loc
+            if isinstance(self.loc, float)
+            else paddle.to_tensor(self.loc)
+        )
+        scale = (
+            self.scale
+            if isinstance(self.scale, float)
+            else paddle.to_tensor(self.scale)
+        )
+        self._dist = StudentT(df, loc, scale)
+
+    def test_prob(self):
+        target_dtype = (  # selects the tolerance entry in config.RTOL/ATOL
+            "float32" if isinstance(self.df, float) else self.df.dtype
+        )
+        np.testing.assert_allclose(
+            self._dist.prob(paddle.to_tensor(self.value)),
+            scipy.stats.t.pdf(self.value, self.df, self.loc, self.scale),
+            rtol=config.RTOL.get(str(target_dtype)),
+            atol=config.ATOL.get(str(target_dtype)),
+        )
+
+    def test_log_prob(self):
+        target_dtype = (  # selects the tolerance entry in config.RTOL/ATOL
+            "float32" if isinstance(self.df, float) else self.df.dtype
+        )
+        np.testing.assert_allclose(
+            self._dist.log_prob(paddle.to_tensor(self.value)),
+            scipy.stats.t.logpdf(self.value, self.df, self.loc, self.scale),
+            rtol=config.RTOL.get(str(target_dtype)),
+            atol=config.ATOL.get(str(target_dtype)),
+        )
+
+
+@parameterize.place(config.DEVICES)
+@parameterize_cls([TEST_CASE_NAME], ['StudentTTestError'])
+class StudentTTestError(unittest.TestCase):  # invalid-argument handling
+    def setUp(self):
+        paddle.disable_static(self.place)  # these cases run in dygraph mode
+
+    @parameterize_func(
+        [
+            (-5.0, 0.0, 1.0, ValueError),  # negative df
+            (5.0, 0.0, -1.0, ValueError),  # negative scale
+        ]
+    )
+    def test_bad_parameter(self, df, loc, scale, error):
+        with paddle.base.dygraph.guard(self.place):
+            self.assertRaises(error, StudentT, df, loc, scale)
+
+    @parameterize_func([(10,)])  # sample shape given as an int, not a sequence
+    def test_bad_sample_shape(self, shape):
+        with paddle.base.dygraph.guard(self.place):
+            t = StudentT(5.0, 0.0, 1.0)
+            self.assertRaises(TypeError, t.sample, shape)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/dygraph_to_static/test_mnist.py b/test/dygraph_to_static/test_mnist.py
index 4c34ae320abad..8f9abe65638c6 100644
--- a/test/dygraph_to_static/test_mnist.py
+++ b/test/dygraph_to_static/test_mnist.py
@@ -26,6 +26,8 @@
 
 import paddle
 from paddle import base
+from paddle.framework import use_pir_api
+from paddle.jit.pir_translated_layer import PIR_INFER_MODEL_SUFFIX
 from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.nn import Linear
 from paddle.optimizer import Adam
@@ -227,16 +229,15 @@ def train(self, to_static=False):
                     prediction, acc, avg_loss = mnist(img, label)
                     loss_data.append(float(avg_loss))
                     # new save load check
-                    # TODO(@xiongkun): enable this after new save load is supported in pir.
-                    if not paddle.framework.use_pir_api():
-                        self.check_jit_save_load(
-                            mnist,
-                            [dy_x_data],
-                            [img, label],
-                            to_static,
-                            prediction,
-                            [img.name],
-                        )
+                    self.check_jit_save_load(
+                        mnist,
+                        [dy_x_data],
+                        [img, label],
+                        to_static,
+                        prediction,
+                        0,
+                        [img.name],
+                    )
                     break
         return loss_data
 
@@ -247,6 +248,7 @@ def check_jit_save_load(
         input_spec,
         to_static,
         gt_out,
+        gt_out_index,
         input_names_after_prune,
     ):
         if to_static:
@@ -255,13 +257,16 @@ def check_jit_save_load(
             )
             model_save_dir = os.path.join(self.temp_dir.name, 'inference')
             model_save_prefix = os.path.join(model_save_dir, 'mnist')
-            model_filename = "mnist" + INFER_MODEL_SUFFIX
+            MODEL_SUFFIX = (
+                PIR_INFER_MODEL_SUFFIX if use_pir_api() else INFER_MODEL_SUFFIX
+            )
+            model_filename = "mnist" + MODEL_SUFFIX
             params_filename = "mnist" + INFER_PARAMS_SUFFIX
             paddle.jit.save(
                 layer=model,
                 path=model_save_prefix,
                 input_spec=input_spec,
-                output_spec=[gt_out],
+                output_spec=[gt_out_index] if use_pir_api() else [gt_out],
                 input_names_after_prune=input_names_after_prune,
             )
             # load in static graph mode
@@ -278,15 +283,16 @@ def check_jit_save_load(
             np.testing.assert_allclose(
                 gt_out.numpy(), dygraph_infer_out, rtol=1e-05
             )
-            # load in Paddle-Inference
-            predictor_infer_out = (
-                self.predictor_load_and_run_inference_analysis(
-                    model_save_dir, model_filename, params_filename, inputs
+            if not use_pir_api():
+                # load in Paddle-Inference
+                predictor_infer_out = (
+                    self.predictor_load_and_run_inference_analysis(
+                        model_save_dir, model_filename, params_filename, inputs
+                    )
+                )
+                np.testing.assert_allclose(
+                    gt_out.numpy(), predictor_infer_out, rtol=1e-05
                 )
-            )
-            np.testing.assert_allclose(
-                gt_out.numpy(), predictor_infer_out, rtol=1e-05
-            )
 
     def jit_load_and_run_inference_static(
         self, model_path, model_filename, params_filename, inputs
diff --git a/test/dygraph_to_static/test_reinforcement_learning.py b/test/dygraph_to_static/test_reinforcement_learning.py
index ade9ba14659d2..fca6e89136353 100644
--- a/test/dygraph_to_static/test_reinforcement_learning.py
+++ b/test/dygraph_to_static/test_reinforcement_learning.py
@@ -16,7 +16,7 @@
 import math
 import unittest
 
-import gym
+import gymnasium as gym
 import numpy as np
 from dygraph_to_static_utils import (
     Dy2StTestBase,
diff --git a/test/dygraph_to_static/test_typehint.py b/test/dygraph_to_static/test_typehint.py
index fd4dbacc6ad6d..b84ce4f332a91 100644
--- a/test/dygraph_to_static/test_typehint.py
+++ b/test/dygraph_to_static/test_typehint.py
@@ -35,15 +35,15 @@ def function(x: A) -> A:
 
 def fn_annotation_assign_with_value(x: paddle.Tensor):
     if x:
-        y: List["paddle.Tensor"] = [x + 1]
+        y: List[paddle.Tensor] = [x + 1]
     else:
-        y: List["paddle.Tensor"] = [x - 1]
+        y: List[paddle.Tensor] = [x - 1]
     return y
 
 
 def fn_annotation_assign_without_value(x: paddle.Tensor):
     if x:
-        y: List["paddle.Tensor"]
+        y: List[paddle.Tensor]
         y = [x + 1]
     else:
         y = [x - 1]
diff --git a/test/deprecated/fft/test_spectral_op.py b/test/fft/test_spectral_op.py
similarity index 99%
rename from test/deprecated/fft/test_spectral_op.py
rename to test/fft/test_spectral_op.py
index 2596fb13eab1c..94168193f468d 100644
--- a/test/deprecated/fft/test_spectral_op.py
+++ b/test/fft/test_spectral_op.py
@@ -14,6 +14,7 @@
 
 import re
 import sys
+import unittest
 
 import numpy as np
 from op_test import OpTest
@@ -311,3 +312,7 @@ def test_check_grad(self):
             ["X"],
             "Out",
         )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/ipu/distributed/run_dist_ipu.sh b/test/ipu/distributed/run_dist_ipu.sh
index 1ab804e626c63..e7deb58c28750 100644
--- a/test/ipu/distributed/run_dist_ipu.sh
+++ b/test/ipu/distributed/run_dist_ipu.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
-  
+
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py
index c176e802a525c..85724a2cc7df2 100644
--- a/test/ir/inference/quant_dequant_test.py
+++ b/test/ir/inference/quant_dequant_test.py
@@ -22,9 +22,10 @@
 
 import paddle
 from paddle import base
-from paddle.base import Program, Variable, core
+from paddle.base import core
 from paddle.base.core import AnalysisConfig, create_paddle_predictor
 from paddle.base.framework import IrGraph
+from paddle.static import Variable
 from paddle.static.io import append_fetch_ops, prepend_feed_ops
 from paddle.static.quantization import (
     AddQuantDequantPass,
@@ -39,10 +40,10 @@ class QuantDequantTest(unittest.TestCase):
     def __init__(self, methodName='runTest'):
         super().__init__(methodName)
         paddle.enable_static()
-        self.main_program = base.Program()
-        self.startup_program = base.Program()
-        self.test_main_program = base.Program()
-        self.test_startup_program = base.Program()
+        self.main_program = paddle.static.Program()
+        self.startup_program = paddle.static.Program()
+        self.test_main_program = paddle.static.Program()
+        self.test_startup_program = paddle.static.Program()
         self.feeds = None
         self.fetch_list = None
         self.enable_mkldnn = False
@@ -62,10 +63,9 @@ def __init__(self, methodName='runTest'):
 
     # from Paddle release2.1
     def _normalize_program(self, program, feed_vars, fetch_vars):
-        if not isinstance(program, Program):
+        if not isinstance(program, paddle.static.Program):
             raise TypeError(
-                "program type must be `base.Program`, but received `%s`"
-                % type(program)
+                f"program type must be `paddle.static.Program`, but received `{type(program)}`"
             )
         if not isinstance(feed_vars, list):
             feed_vars = [feed_vars]
@@ -127,7 +127,7 @@ def _save_models(
             if var.name in feeded_var_names:
                 feeded_vars.append(var)
 
-        with base.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             paddle.static.io.save_inference_model(
                 dirname,
                 feeded_vars,
@@ -155,7 +155,7 @@ def _get_paddle_outs(self, feed, fetch_list, executor, program, scope):
         '''
         Return PaddlePaddle outputs.
         '''
-        with base.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             outs = executor.run(
                 program=program,
                 feed=feed,
@@ -245,12 +245,12 @@ def check_output_with_option(
         or disable TensorRT, enable MKLDNN or disable MKLDNN
         are all the same.
         '''
-        place = base.CUDAPlace(0) if use_gpu else base.CPUPlace()
-        executor = base.Executor(place)
-        scope = base.Scope()
+        place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
+        executor = paddle.static.Executor(place)
+        scope = paddle.static.Scope()
         device = "GPU" if use_gpu else "CPU"
 
-        with base.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             executor.run(self.startup_program)
             executor.run(self.test_startup_program)
         main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False)
@@ -274,11 +274,11 @@ def check_output_with_option(
         scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
         scale_training_pass.apply(main_graph)
 
-        build_strategy = base.BuildStrategy()
+        build_strategy = paddle.static.BuildStrategy()
         build_strategy.memory_optimize = False
         build_strategy.enable_inplace = False
         build_strategy.fuse_all_reduce_ops = False
-        binary = base.CompiledProgram(main_graph.graph)
+        binary = paddle.static.CompiledProgram(main_graph.graph)
 
         iters = 10
         batch_size = 1
@@ -287,7 +287,7 @@ def check_output_with_option(
             batch_size=batch_size,
         )
         feeder = base.DataFeeder(feed_list=[self.data, self.label], place=place)
-        with base.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             for _ in range(iters):
                 data = next(train_reader())
                 loss_v = executor.run(
@@ -307,7 +307,7 @@ def check_output_with_option(
 
         self.main_program = test_graph.to_program()
 
-        with base.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             self.main_program = self._normalize_program(
                 self.main_program, self.data, self.fetch_list
             )
@@ -450,6 +450,6 @@ def __init__(
             self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16
 
     def quant_dequant(self):
-        place = base.CPUPlace()
-        exe = base.Executor(place)
-        scope = base.Scope()
+        place = paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        scope = paddle.static.Scope()
diff --git a/test/ir/pir/CMakeLists.txt b/test/ir/pir/CMakeLists.txt
index e80898846c557..29df19c523d88 100644
--- a/test/ir/pir/CMakeLists.txt
+++ b/test/ir/pir/CMakeLists.txt
@@ -42,3 +42,4 @@ py_test_modules(
   FLAGS_pir_subgraph_saving_dir=${CMAKE_CURRENT_SOURCE_DIR})
 
 add_subdirectory(fused_pass)
+add_subdirectory(translator)
diff --git a/test/ir/pir/cinn/CMakeLists.txt b/test/ir/pir/cinn/CMakeLists.txt
index a8d99e7170654..6261e22868264 100644
--- a/test/ir/pir/cinn/CMakeLists.txt
+++ b/test/ir/pir/cinn/CMakeLists.txt
@@ -34,8 +34,8 @@ if(WITH_GPU)
       PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH}
       FLAGS_enable_pir_api=1 FLAGS_prim_all=True
       FLAGS_cinn_new_group_scheduler=1 FLAGS_cinn_bucket_compile=1
-      FLAGS_support_reduce_stride_read=1 FLAGS_group_schedule_tiling_first=1
-      ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_cinn_sub_graph.py
+      FLAGS_group_schedule_tiling_first=1 ${PYTHON_EXECUTABLE}
+      ${CMAKE_CURRENT_SOURCE_DIR}/test_cinn_sub_graph.py
     WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
   set_tests_properties(test_cinn_sub_graph_stride_read
                        PROPERTIES LABELS "RUN_TYPE=CINN")
diff --git a/test/ir/pir/cinn/performance/CMakeLists.txt b/test/ir/pir/cinn/performance/CMakeLists.txt
index 9bbb186614eb6..a8145d0c4083d 100644
--- a/test/ir/pir/cinn/performance/CMakeLists.txt
+++ b/test/ir/pir/cinn/performance/CMakeLists.txt
@@ -20,21 +20,6 @@ if(WITH_GPU)
       WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
     set_tests_properties(${cinn_pir_test_name} PROPERTIES LABELS
                                                           "RUN_TYPE=CINN")
-
-    add_test(
-      NAME ${cinn_pir_test_name}_stride_read
-      COMMAND
-        ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH}
-        FLAGS_check_infer_symbolic=1 FLAGS_enable_pir_api=1
-        FLAGS_cinn_bucket_compile=True FLAGS_prim_enable_dynamic=true
-        FLAGS_pir_apply_shape_optimization_pass=1
-        FLAGS_group_schedule_tiling_first=1 FLAGS_cinn_new_group_scheduler=1
-        FLAGS_support_reduce_stride_read=1 ${PYTHON_EXECUTABLE}
-        ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_pir_test_name}.py
-      WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-    set_tests_properties(${cinn_pir_test_name}_stride_read
-                         PROPERTIES LABELS "RUN_TYPE=CINN")
   endforeach()
 
 endif()
diff --git a/test/ir/pir/cinn/sub_graphs/base.py b/test/ir/pir/cinn/sub_graphs/base.py
index a11ffe4f9e1bd..a0ceee03095db 100644
--- a/test/ir/pir/cinn/sub_graphs/base.py
+++ b/test/ir/pir/cinn/sub_graphs/base.py
@@ -30,7 +30,7 @@ def setUp(self):
         self.atol = 1e-6
         self.train_atol = 1e-6
         self.with_precision_compare = True
-        self.with_train = False  # 本个pr中默认为false，下个增量pr中改为默认true
+        self.with_train = True  # on by default; cases may opt out in init()
         # override customized settting
         self.init()
         if self.inputs:
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py
index e5d86d0e40f53..228465812c587 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py
@@ -135,7 +135,6 @@ def init(self):
             paddle.rand(shape=[22, 512, 7, 7], dtype=paddle.float32),
         )
         self.net = LayerCase
-        self.with_train = True
 
     def set_flags(self):
         # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py
index 10ed97211646c..d40e635bca9ed 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py
@@ -62,6 +62,7 @@ def init(self):
             paddle.rand(shape=[10, 512, 7, 7], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py
index c151d478a6ac6..b871017d1e038 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py
@@ -75,6 +75,7 @@ def init(self):
             paddle.rand(shape=[10, 36, 28, 28], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py
index 464ab6166a0fa..83fd4bff996bc 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py
@@ -65,6 +65,7 @@ def init(self):
             paddle.rand(shape=[10, 1280, 1, 1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py
index 24d79ccfc8e94..dd91f88558b59 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py
@@ -60,6 +60,7 @@ def init(self):
             paddle.rand(shape=[10, 2048, 7, 7], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py
index 167b10dd6df2f..7708b6fb6c2bb 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py
@@ -72,6 +72,7 @@ def init(self):
             paddle.rand(shape=[22, 128, 56, 56], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py
index c5050e5cb9d55..4d1ac693615d3 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py
@@ -72,6 +72,7 @@ def init(self):
             paddle.rand(shape=[10, 122, 28, 28], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py
index 5fad58c5de16b..3e6696a5f23c9 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py
@@ -115,6 +115,7 @@ def init(self):
             paddle.rand(shape=[22, 28, 56, 56], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     def set_flags(self):
         # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py
index 5dc0d861cc847..62ef8a2dbe38c 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py
@@ -60,6 +60,7 @@ def init(self):
             paddle.rand(shape=[22, 2048, 7, 7], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py
index b4010043304be..e8f4772b757a5 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py
@@ -68,6 +68,7 @@ def init(self):
             paddle.rand(shape=[22, 1536, 8, 8], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_precision_compare = False
 
     # NOTE output mismatch with prim
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py
index d3faccc973b03..883067279e417 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py
@@ -74,7 +74,6 @@ def init(self):
             paddle.rand(shape=[43, 256, 56, 56], dtype=paddle.float32),
         )
         self.net = LayerCase
-        self.with_train = True
 
     def set_flags(self):
         # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py
index 57dcec3e56353..82523d9dd29e4 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py
@@ -77,6 +77,7 @@ def init(self):
             paddle.rand(shape=[86, 192], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py
index 49eea1bd4cbfd..b19151557a65a 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py
@@ -108,6 +108,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[86, 198, 192], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE output mismatch with prim
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py
index 83ddc2b51b2b8..b37c912b61f5d 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py
@@ -60,6 +60,7 @@ def init(self):
             paddle.rand(shape=[11, 24, 56, 56], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py
index b434f440365f6..d6be0ea181c59 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py
@@ -68,6 +68,7 @@ def init(self):
             paddle.rand(shape=[11, 1280, 7, 7], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_precision_compare = False
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py
index 6a25c112a0b47..5387f9ee37177 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py
@@ -68,6 +68,7 @@ def init(self):
             paddle.rand(shape=[10, 320, 8, 8], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_precision_compare = False
 
     # NOTE prim + cinn lead to error
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py
index 85b2207fd1ee1..9283f453e46ae 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py
@@ -68,6 +68,7 @@ def init(self):
             paddle.rand(shape=[10, 2048, 10, 10], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_precision_compare = False
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py
index 23b9ec755c7be..9c538dea0d694 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py
@@ -89,6 +89,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[16, 49], dtype=paddle.int64),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py
index 81d18df09b741..eee47cf931cd9 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py
@@ -66,6 +66,7 @@ def init(self):
             paddle.rand(shape=[22, 288, 14, 14], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py
index 7586bd7c8cd37..2bed2bfc9a742 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py
@@ -54,6 +54,7 @@ def init(self):
             paddle.rand(shape=[22, 1024, 1, 1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py
index 0d50f420cdc22..55b168f5e2ade 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py
@@ -84,6 +84,7 @@ def init(self):
             paddle.rand(shape=[10, 256, 14, 14], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-5
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py
index 7466135585abd..a8d09423a95eb 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py
@@ -57,6 +57,7 @@ def init(self):
             paddle.rand(shape=[10, 32, 56, 56], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py
index 7eb05d010bd2f..8c70aa1f75ae2 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py
@@ -84,6 +84,7 @@ def init(self):
             paddle.rand(shape=[4, 3, 384, 384], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py
index 03f141b241bdc..6abd8655d98f6 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py
@@ -70,6 +70,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[6, 9216, 96], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py
index 431650d6bdbef..828f15fa32c3b 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py
@@ -48,6 +48,7 @@ def init(self):
             paddle.rand(shape=[4, 48, 96, 96], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py
index ddd3cdf8c3eda..44431cb437d82 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py
@@ -46,6 +46,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[12, 288, 192], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py
index 9d419dbb38959..f03c8322cce70 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py
@@ -51,6 +51,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[22, 196, 128], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py
index 352f81b791d41..d3d09e75e4f70 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py
@@ -66,6 +66,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py
index 0e8a6574081a4..60d3846377987 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py
@@ -114,6 +114,7 @@ def init(self):
             paddle.rand(shape=[2, 4], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     def set_flags(self):
         # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py
index 0104a18d75d60..9440b6cb9dbd5 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py
@@ -258,6 +258,7 @@ def init(self):
             paddle.rand(shape=[1, 2048, 24, 36], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-5
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py
index 06c021953fd1e..34416aea9ae97 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py
@@ -143,6 +143,7 @@ def init(self):
             paddle.rand(shape=[1, 100, 256], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-8
         self.with_cinn = False
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py
index 8c9802242f436..d2f6befdc9147 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py
@@ -70,6 +70,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[1, 4], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py
index 6e45b88c332da..19ec352bcf5d4 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py
@@ -62,6 +62,7 @@ def init(self):
             paddle.rand(shape=[1, 80, 50, 50], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py
index 72599e85f742f..5096d5f366b63 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py
@@ -47,6 +47,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int64),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py
index eaa9d3e6b9232..7fc4b64f1466f 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py
@@ -190,6 +190,7 @@ def init(self):
             paddle.rand(shape=[1, 625, 1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-5
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py
index 34ecd19552529..4367e45015b23 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py
@@ -66,6 +66,7 @@ def init(self):
             paddle.rand(shape=[1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py
index 7c9639d906cda..181d06fffb4c3 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py
@@ -46,6 +46,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[22, 16, 384], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py
index 10ab5da982012..152dc5b2ce483 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py
@@ -90,6 +90,7 @@ def init(self):
             paddle.rand(shape=[1, 4, 64, 64], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py
index ed08605e070d1..e1a3774b1be35 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py
@@ -94,6 +94,7 @@ def init(self):
             paddle.rand(shape=[91], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py
index cf04f914d15a9..7bdef30c7d243 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py
@@ -117,6 +117,7 @@ def init(self):
             paddle.rand(shape=[1, 96, 128, 128], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py
index 7d065da0bc99b..9a623a7afa130 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py
@@ -78,6 +78,7 @@ def init(self):
             paddle.rand(shape=[1, 192, 32, 32], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py
index 79d9a9c15cf9e..4646923191e60 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py
@@ -74,6 +74,7 @@ def init(self):
             paddle.rand(shape=[24], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py
index a34e30dc687e2..d297a19fa0932 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py
@@ -42,6 +42,7 @@ def init(self):
         self.input_specs = []
         self.inputs = ()
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py
index 12dc85dbf3d3f..072c8077b7295 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py
@@ -95,6 +95,7 @@ def init(self):
             paddle.rand(shape=[1, 44, 32, 32], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py
index f51b3a846151d..89a1c19ed53a7 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py
@@ -47,6 +47,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[10, 196, 640], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py
index 21332c862ab22..41be02a221bd4 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py
@@ -91,6 +91,7 @@ def init(self):
             paddle.rand(shape=[1, 4], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-5
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py
index d4a2234509d1c..dd6069d9f9555 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py
@@ -71,6 +71,7 @@ def init(self):
             paddle.rand(shape=[1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py
index 5456431c96fea..6a6f430bd82be 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py
@@ -96,6 +96,7 @@ def init(self):
             paddle.rand(shape=[171888, 4], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     def set_flags(self):
         # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py
index 9ec76729c00e0..820f7af48178e 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py
@@ -72,6 +72,7 @@ def init(self):
             paddle.rand(shape=[512, 256, 7, 7], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py
index 18af525df5c4c..e7e636628d5f1 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py
@@ -55,6 +55,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[2, 2002], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py
index 1c3d72c455056..033202891b2ed 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py
@@ -64,6 +64,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[2, 1788], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py
index 75fb8ca7cfb38..74513aac91b5b 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py
@@ -134,6 +134,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[1], dtype=paddle.int32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_cinn = False
 
     # NOTE prim + cinn lead to error
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py
index d3571d898798f..67df4b8fba497 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py
@@ -206,6 +206,7 @@ def init(self):
             paddle.rand(shape=[528, 4], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py
index c1c4b94929310..4e64e3aea0bbc 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py
@@ -65,6 +65,7 @@ def init(self):
             paddle.rand(shape=[1, 171888, 4], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_precision_compare = False
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py
index f4236d7664c59..bdc2d7b052c77 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py
@@ -91,6 +91,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[49, 49], dtype=paddle.int64),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_cinn = False
 
     # NOTE prim + cinn lead to error
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py
index 30b04988e601f..a483c47e1e05f 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py
@@ -61,6 +61,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py
index ff048a21337da..489eab05cf04e 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py
@@ -143,6 +143,7 @@ def init(self):
         self.input_specs = []
         self.inputs = ()
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py
index ea4a9cd49726d..a75d51a21cd1e 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py
@@ -98,6 +98,7 @@ def init(self):
             paddle.rand(shape=[2], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_cinn = False
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py
index a069b9bc3874b..03fcab9ff9f00 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py
@@ -75,6 +75,7 @@ def init(self):
             paddle.rand(shape=[2], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_precision_compare = False
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py
index 41204b7c15d2e..a20fbaf33e4e7 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py
@@ -96,6 +96,7 @@ def init(self):
             paddle.rand(shape=[1, 3, 544, 736], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py
index bb22fb38c693a..4ad52c6aa976c 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py
@@ -209,6 +209,7 @@ def init(self):
             paddle.rand(shape=[1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py
index af4320f4609ef..f987f5a334ca6 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py
@@ -125,6 +125,7 @@ def init(self):
             paddle.rand(shape=[1, 256, 13, 19], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py
index 96d9de9b9c2b6..1bf2af665a2e2 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py
@@ -134,6 +134,7 @@ def init(self):
             paddle.rand(shape=[1, 3, 96, 96, 1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py
index 6340bf5a4d451..656e522137b4b 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py
@@ -47,6 +47,7 @@ def init(self):
             paddle.rand(shape=[22, 128, 14, 14], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py
index 2fe8b3f007e86..4a34d06b5b4af 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py
@@ -125,6 +125,7 @@ def init(self):
             paddle.rand(shape=[1, 3, 48, 48, 1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py
index dc0d1e5126259..acbe1eae0ae60 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py
@@ -80,6 +80,7 @@ def init(self):
             paddle.rand(shape=[1, 80, 44, 44], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py
index 65ab9b68b7b6d..9761629a802e3 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py
@@ -173,6 +173,7 @@ def init(self):
             paddle.rand(shape=[2541, 2], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_cinn = False
 
     # NOTE cinn lead to error
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py
index 2a1a527317b91..889e5b0e9dfde 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py
@@ -96,6 +96,7 @@ def init(self):
         self.input_specs = []
         self.inputs = ()
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py
index 595163ad073e1..a20bac9133a8f 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py
@@ -81,6 +81,7 @@ def init(self):
             paddle.rand(shape=[1, 2541, 68], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py
index 9ef4bf92bc473..80137072f1c23 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py
@@ -61,6 +61,7 @@ def init(self):
             paddle.rand(shape=[16384, 5], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py
index 698760309d8ff..47221f58d3ca3 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py
@@ -247,6 +247,7 @@ def init(self):
             paddle.rand(shape=[1, 2048, 1, 1], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py
index b44fdc4c28783..4e23ab81535de 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py
@@ -201,6 +201,7 @@ def init(self):
             paddle.rand(shape=[1, 144, 21, 32], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py
index 425537e634f25..0ed66f4e89e8d 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py
@@ -79,6 +79,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[1, 500], dtype=paddle.int32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py
index ab1503ef63afa..21faaf7dcad30 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py
@@ -91,6 +91,7 @@ def init(self):
             paddle.rand(shape=[1, 256, 28, 40], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
 
 
 # if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py
index e8919aec6e379..7dd68051a5efa 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py
@@ -90,6 +90,7 @@ def init(self):
             paddle.randint(low=0, high=10, shape=[49, 196], dtype=paddle.int64),
         )
         self.net = LayerCase
+        self.with_train = False
         self.with_cinn = False
 
     # NOTE prim + cinn lead to error
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py
index e3f28f9775a69..85f937d265d5b 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py
@@ -65,6 +65,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[12], dtype=paddle.float32),)
         self.net = LayerCase
+        self.with_train = False
 
     def set_flags(self):
         # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py
index d4d06895c49ae..1a166fad740a7 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py
@@ -48,6 +48,7 @@ def init(self):
             paddle.rand(shape=[22, 480, 7, 7], dtype=paddle.float32),
         )
         self.net = AdaptiveAvgPool2dCase
+        self.with_train = False
 
     # NOTE prim + cinn lead to error
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py
index c9cf656ad4a0c..9434d1c189373 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py
@@ -54,6 +54,7 @@ def init(self):
             paddle.rand(shape=[22, 196, 128], dtype=paddle.float32),
         )
         self.net = AddCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py
index c488de14d12be..18cf5c72f2a50 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py
@@ -104,6 +104,7 @@ def init(self):
             paddle.rand(shape=[1], dtype=paddle.float32),
         )
         self.net = AddNCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py
index 0a40ca5079931..957102539eb07 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py
@@ -56,6 +56,7 @@ def init(self):
             paddle.rand(shape=[22, 128, 56, 56], dtype=paddle.float32),
         )
         self.net = AvgPool2dCase
+        self.with_train = False
         self.atol = 1e-8
         self.with_cinn = False
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py
index 36dae471d0d7d..35e12f767dae7 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py
@@ -46,6 +46,7 @@ def init(self):
             paddle.rand(shape=[10, 2304, 192], dtype=paddle.float32),
         )
         self.net = ChunkCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py
index f65682e4b0ae9..b298c0870d4bc 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py
@@ -54,6 +54,7 @@ def init(self):
             paddle.rand(shape=[145, 12, 112, 112], dtype=paddle.float32),
         )
         self.net = ConcatCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py
index c189750c9f040..5bdd5b1622a34 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py
@@ -63,6 +63,7 @@ def init(self):
             paddle.rand(shape=[22, 64, 56, 56], dtype=paddle.float32),
         )
         self.net = ConvNdCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py
index 381eb461b6328..c4a358ad4b0bf 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py
@@ -54,6 +54,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[10, 64], dtype=paddle.float32),)
         self.net = LinearCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py
index 5cd643fc5ef4a..96d2bd54868d1 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py
@@ -55,6 +55,7 @@ def init(self):
             paddle.rand(shape=[22, 64, 112, 112], dtype=paddle.float32),
         )
         self.net = MaxPool2dCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py
index 1e56b482d3736..fa389063a0513 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py
@@ -54,6 +54,7 @@ def init(self):
             paddle.rand(shape=[22, 1500, 14, 14], dtype=paddle.float32),
         )
         self.net = LayerCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py
index f628bc19cc9aa..f267c1610f665 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py
@@ -47,6 +47,7 @@ def init(self):
             paddle.rand(shape=[22, 144, 56, 56], dtype=paddle.float32),
         )
         self.net = Relu6Case
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py
index 5abaff9157d1d..540958310b7cc 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py
@@ -44,6 +44,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[4312, 640], dtype=paddle.float32),)
         self.net = ReshapeCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py
index 3f77a5c68a93a..a746f3cdd41bc 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py
@@ -46,6 +46,7 @@ def init(self):
             paddle.rand(shape=[10, 512, 1, 1], dtype=paddle.float32),
         )
         self.net = SigmoidCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py
index b82ec109ca724..57de6d8cb09c0 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py
@@ -48,6 +48,7 @@ def init(self):
             paddle.rand(shape=[11, 976, 7, 7], dtype=paddle.float32),
         )
         self.net = SplitCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py
index 516d6c6735ff6..4f7438c8a00eb 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py
@@ -51,6 +51,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[1, 12, 1, 64], dtype=paddle.float32),)
         self.net = SqueezeCase
+        self.with_train = False
         self.atol = 1e-8
 
 
@@ -66,6 +67,7 @@ def init(self):
         ]
         self.inputs = (paddle.rand(shape=[1, 12, 1, 64], dtype=paddle.float32),)
         self.net = UnsqueezeCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py
index 1f7402d0470ed..da572f47bfd94 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py
@@ -46,6 +46,7 @@ def init(self):
             paddle.rand(shape=[43, 32, 112, 112], dtype=paddle.float32),
         )
         self.net = SwishCase
+        self.with_train = False
         self.atol = 1e-8
 
 
diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py
index 49a05607e3ae3..51db880532187 100644
--- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py
+++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py
@@ -46,6 +46,7 @@ def init(self):
             paddle.rand(shape=[22, 4, 224, 224], dtype=paddle.float32),
         )
         self.net = TransposeCase
+        self.with_train = False
 
 
 if __name__ == '__main__':
diff --git a/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py b/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py
new file mode 100644
index 0000000000000..a3e9b838eeae4
--- /dev/null
+++ b/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+from os.path import dirname
+
+import numpy as np
+
+import paddle
+from paddle import nn
+from paddle.static import InputSpec
+
+sys.path.append(dirname(dirname(__file__)))
+
+import utils
+
+
+class GroupNorm(nn.Layer):
+    """NHWC group_norm (32 groups) with random 128-element weight and bias."""
+
+    def __init__(self):
+        super().__init__()
+        self.dtype = "float32"
+        self.hidden_size = 768
+        self.data_format = "NHWC"
+        self.weight = paddle.randn([128], dtype=self.dtype)
+        self.bias = paddle.randn([128], dtype=self.dtype)
+        self.weight.stop_gradient = self.bias.stop_gradient = False
+
+    def forward(self, x):
+        return paddle.nn.functional.group_norm(
+            x,
+            weight=self.weight,
+            bias=self.bias,
+            num_groups=32,
+            epsilon=1e-6,
+            data_format=self.data_format,
+        )
+
+
+class TestGroupNorm(unittest.TestCase):
+    """Checks NHWC group_norm output parity with and without CINN."""
+
+    def setUp(self):
+        paddle.seed(2024)
+        self.dtype = "float32"
+        self.data_format = "NHWC"
+        self.shape = [1, 128, 256, 128]
+        self.prepare_data()
+
+    def prepare_data(self):
+        self.x = paddle.randn(self.shape, dtype=self.dtype)
+        self.x.stop_gradient = False
+
+    def check_jit_kernel_info(self, static_fn):
+        expected = 2  # the same count is passed to both utils checks
+        utils.check_jit_kernel_number(static_fn, expected)
+        utils.check_jit_kernel_structure(
+            static_fn, {utils.JIT_KERNEL_NAME: expected}
+        )
+
+    def eval(self, use_cinn):
+        paddle.seed(2024)  # reseed before GroupNorm() samples weight/bias
+        spec = InputSpec(shape=[None, None, None, 128], dtype='float32')
+        net = utils.apply_to_static(GroupNorm(), use_cinn, [spec])
+        net.eval()
+        out = net(self.x)
+        if use_cinn:
+            self.check_jit_kernel_info(net.forward)
+        return out
+
+    def test_eval(self):
+        got = self.eval(use_cinn=True).numpy()
+        want = self.eval(use_cinn=False).numpy()
+        np.testing.assert_allclose(got, want, atol=1e-6, rtol=1e-6)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/ir/pir/fused_pass/xpu/test_group_norm_silu_xpu_fuse_pass.py b/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py
similarity index 52%
rename from test/ir/pir/fused_pass/xpu/test_group_norm_silu_xpu_fuse_pass.py
rename to test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py
index 3a515d7d62b66..6443a60c331f9 100644
--- a/test/ir/pir/fused_pass/xpu/test_group_norm_silu_xpu_fuse_pass.py
+++ b/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py
@@ -11,34 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import unittest
 
 import numpy as np
 from pass_test import PassTest
 
 import paddle
-from paddle.base import core
 
 paddle.enable_static()
 
 
-class TestGroupNormSiluXpuFusePattern(PassTest):
-    r"""
-                      X
-              Scale   |   Bias
-                   \  |  /
-                  group norm
-                   /  |  \
-                  /   |   \
-            variance  |   mean
-                      |
-                     silu
-                      |
-                    output
-    """
-
-    def is_program_valid(self, program):
+class TestMeanPlacementPass(PassTest):
+    def is_program_valid(self, program=None):
         return True
 
     def build_ir_program(self):
@@ -46,40 +30,28 @@ def build_ir_program(self):
             main_prog = paddle.static.Program()
             start_prog = paddle.static.Program()
             with paddle.pir.core.program_guard(main_prog, start_prog):
-                channels = 128
-                groups = 32
                 x = paddle.static.data(
-                    name='X', shape=[1, channels, 64, 64], dtype='float32'
+                    name='x', shape=[5, 2, 5, 5], dtype='float32'
                 )
+                mean = paddle.mean(x)
+                out = paddle.assign(mean)
+                self.pass_attr_list = [{'onednn_placement_pass': {}}]
 
-                group_norm = paddle.nn.GroupNorm(groups, channels)
-                silu = paddle.nn.Silu()
-
-                group_norm_out = group_norm(x)
-                out = silu(group_norm_out)
-                out = paddle.assign(out)
-                self.pass_attr_list = [{'group_norm_silu_xpu_fuse_pass': {}}]
                 self.feeds = {
-                    "X": np.random.random((1, channels, 64, 64)).astype(
-                        "float32"
-                    ),
+                    "x": np.random.random((5, 2, 5, 5)).astype("float32"),
                 }
                 self.fetch_list = [out]
                 self.valid_op_map = {
-                    "pd_op.group_norm": 0,
-                    "pd_op.silu": 0,
-                    "pd_op.group_norm_silu_xpu": 1,
+                    "onednn_op.mean": 1,
                 }
                 return [main_prog, start_prog]
 
-    def setUp(self):
-        if core.is_compiled_with_xpu():
-            self.places.append(paddle.XPUPlace(0))
-        self.skip_accuracy_verification = True
-
     def sample_program(self):
         yield self.build_ir_program(), False
 
+    def setUp(self):
+        self.places.append(paddle.CPUPlace())
+
     def test_check_output(self):
         self.check_pass_correct()
 
diff --git a/test/ir/pir/fused_pass/pass_test.py b/test/ir/pir/fused_pass/pass_test.py
index 3bb937ec59771..c5066bad6b34f 100644
--- a/test/ir/pir/fused_pass/pass_test.py
+++ b/test/ir/pir/fused_pass/pass_test.py
@@ -69,7 +69,7 @@ def run_program(self, executor, startup_program, main_program):
                 fetches = executor.run(
                     main_program,
                     feed=self.feeds,
-                    fetch_list=self.fetch_list,
+                    fetch_list=main_program.list_vars()[-1],
                 )
                 return fetches
 
diff --git a/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py b/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py
index fac6e62bc2278..addb443cb70f8 100644
--- a/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py
+++ b/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py
@@ -21,8 +21,6 @@
 from paddle.base import core
 from paddle.pir.core import create_parameter
 
-paddle.enable_static()
-
 
 class TestRmsNormFusePattern(PassTest):
     r"""
@@ -284,7 +282,7 @@ class TestAddLayerNormFusePattern(TestRmsNormFusePattern):
     def sample_program(self):
         for x_shape in [[1, 1, 4096]]:
             for w_shape in [[4096]]:
-                for w_type in ['float32']:
+                for x_type in ['float32', 'float16']:
                     for epilson in [1e-6]:
                         with paddle.pir_utils.IrGuard():
                             start_prog = paddle.static.Program()
@@ -295,10 +293,10 @@ def sample_program(self):
                                 residual = paddle.static.data(
                                     name='residual',
                                     shape=x_shape,
-                                    dtype='float32',
+                                    dtype=x_type,
                                 )
                                 x = paddle.static.data(
-                                    name='x', shape=x_shape, dtype='float32'
+                                    name='x', shape=x_shape, dtype=x_type
                                 )
                                 w_attr = paddle.ParamAttr(
                                     learning_rate=0.0,
@@ -306,13 +304,19 @@ def sample_program(self):
                                         mean=0.0, std=2.0
                                     ),
                                 )
+                                b_attr = paddle.ParamAttr(
+                                    learning_rate=0.0,
+                                    initializer=paddle.nn.initializer.Normal(
+                                        mean=0.0, std=2.0
+                                    ),
+                                )
                                 w1 = create_parameter(
                                     name="w1",
                                     shape=w_shape,
-                                    dtype=w_type,
+                                    dtype=x_type,
                                     initializer=paddle.nn.initializer.Assign(
                                         np.random.random([4096, 4096]).astype(
-                                            w_type
+                                            x_type
                                         )
                                     ),
                                 )
@@ -322,6 +326,7 @@ def sample_program(self):
                                     add_out.shape[-1:],
                                     epsilon=epilson,
                                     weight_attr=w_attr,
+                                    bias_attr=b_attr,
                                 )
                                 layer_norm_out = layer_norm(add_out)
                                 matmul_out = paddle.matmul(layer_norm_out, w1)
@@ -332,11 +337,11 @@ def sample_program(self):
                                 ]
                                 self.feeds = {
                                     "x": np.random.random(x_shape).astype(
-                                        "float32"
+                                        x_type
                                     ),
                                     "residual": np.random.random(
                                         x_shape
-                                    ).astype("float32"),
+                                    ).astype(x_type),
                                 }
                                 self.fetch_list = [out]
                                 self.valid_op_map = {
@@ -350,5 +355,202 @@ def test_check_output(self):
         self.check_pass_correct(atol=1e-3, rtol=1e-3)
 
 
+class TestAddGroupNormPattern_FP16(PassTest):
+    r"""
+    fp16 add + group_norm is expected to fuse into one add_group_norm_silu op.
+    x         residual
+         add
+          |
+      group_norm
+    """
+
+    def is_program_valid(self, program=None):
+        return True
+
+    def sample_program(self):
+        for x_shape in [[2, 6, 4, 2]]:
+            for residual_shape in [[1, 6, 1, 1]]:
+                for dtype in ['float16']:
+                    for epilson in [1e-5]:
+                        for groups in [2]:
+                            for data_layout in ['NCHW']:
+                                rand_value = (
+                                    0.001
+                                    * paddle.rand(
+                                        shape=[x_shape[1]], dtype=dtype
+                                    ).numpy()
+                                )
+                                with paddle.pir_utils.IrGuard():
+                                    start_prog = paddle.static.Program()
+                                    main_prog = paddle.static.Program()
+                                    with paddle.pir.core.program_guard(
+                                        main_prog, start_prog
+                                    ):
+                                        residual = paddle.static.data(
+                                            name='residual',
+                                            shape=residual_shape,
+                                            dtype=dtype,
+                                        )
+                                        x = paddle.static.data(
+                                            name='x', shape=x_shape, dtype=dtype
+                                        )
+                                        w = create_parameter(
+                                            shape=[x_shape[1]],
+                                            dtype=dtype,
+                                            initializer=paddle.nn.initializer.Assign(
+                                                rand_value
+                                            ),
+                                        )
+                                        b = create_parameter(
+                                            shape=[x_shape[1]],
+                                            dtype=dtype,
+                                            initializer=paddle.nn.initializer.Assign(
+                                                rand_value
+                                            ),
+                                        )
+                                        add_out = paddle.add(x, residual)
+
+                                        group_norm_out = (
+                                            paddle.nn.functional.group_norm(
+                                                add_out,
+                                                num_groups=groups,
+                                                epsilon=epilson,
+                                                weight=w,
+                                                bias=b,
+                                                data_format=data_layout,
+                                            )
+                                        )
+                                        out = paddle.assign(group_norm_out)
+                                        self.pass_attr_list = [
+                                            {'add_norm_fuse_pass': {}},
+                                            {'transfer_layout_pass': {}},
+                                            {
+                                                'remove_redundant_transpose_pass': {}
+                                            },
+                                        ]
+                                        self.feeds = {
+                                            "x": np.random.random(
+                                                x_shape
+                                            ).astype(dtype),
+                                            "residual": np.random.random(
+                                                residual_shape
+                                            ).astype(dtype),
+                                        }
+                                        self.fetch_list = [out]
+                                        self.valid_op_map = {
+                                            "pd_op.add": 0,
+                                            "pd_op.group_norm": 0,
+                                            "pd_op.add_group_norm_silu": 1,
+                                        }
+                                        yield [main_prog, start_prog], False
+
+    def setUp(self):
+        if core.is_compiled_with_cuda():  # only a CUDA place is registered
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_check_output(self):
+        self.check_pass_correct()
+
+
+class TestAddGroupNormPatternSilu_FP16(PassTest):
+    r"""
+    x + residual (add)
+          |
+      group_norm
+          |
+        silu
+    """
+
+    def is_program_valid(self, program=None):
+        return True
+
+    def sample_program(self):
+        for x_shape in [[2, 6, 4, 2]]:
+            for residual_shape in [[1, 6, 1, 1]]:
+                for dtype in ['float16']:
+                    for epilson in [1e-5]:
+                        for groups in [2]:
+                            for data_layout in ['NCHW']:
+                                rand_value = (
+                                    0.001
+                                    * paddle.rand(
+                                        shape=[x_shape[1]], dtype=dtype
+                                    ).numpy()
+                                )
+                                with paddle.pir_utils.IrGuard():
+                                    start_prog = paddle.static.Program()
+                                    main_prog = paddle.static.Program()
+                                    with paddle.pir.core.program_guard(
+                                        main_prog, start_prog
+                                    ):
+                                        residual = paddle.static.data(
+                                            name='residual',
+                                            shape=residual_shape,
+                                            dtype=dtype,
+                                        )
+                                        x = paddle.static.data(
+                                            name='x', shape=x_shape, dtype=dtype
+                                        )
+                                        w = create_parameter(
+                                            shape=[x_shape[1]],
+                                            dtype=dtype,
+                                            initializer=paddle.nn.initializer.Assign(
+                                                rand_value
+                                            ),
+                                        )
+                                        b = create_parameter(
+                                            shape=[x_shape[1]],
+                                            dtype=dtype,
+                                            initializer=paddle.nn.initializer.Assign(
+                                                rand_value
+                                            ),
+                                        )
+                                        add_out = paddle.add(x, residual)
+                                        group_norm_out = (
+                                            paddle.nn.functional.group_norm(
+                                                add_out,
+                                                num_groups=groups,
+                                                epsilon=epilson,
+                                                weight=w,
+                                                bias=b,
+                                                data_format=data_layout,
+                                            )
+                                        )
+                                        out = paddle.nn.functional.silu(
+                                            group_norm_out
+                                        )
+                                        out = paddle.assign(out)
+                                        self.pass_attr_list = [
+                                            {'add_norm_fuse_pass': {}},
+                                            {'transfer_layout_pass': {}},
+                                            {
+                                                'remove_redundant_transpose_pass': {}
+                                            },
+                                        ]
+                                        self.feeds = {
+                                            "x": np.random.random(
+                                                x_shape
+                                            ).astype(dtype),
+                                            "residual": np.random.random(
+                                                residual_shape
+                                            ).astype(dtype),
+                                        }
+                                        self.fetch_list = [out]
+                                        self.valid_op_map = {
+                                            "pd_op.silu": 0,
+                                            "pd_op.add": 0,
+                                            "pd_op.group_norm": 0,
+                                            "pd_op.add_group_norm_silu": 1,
+                                        }
+                                        yield [main_prog, start_prog], False
+
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_check_output(self):
+        self.check_pass_correct()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/ir/pir/fused_pass/test_group_norm_silu_fuse_pass.py b/test/ir/pir/fused_pass/test_group_norm_silu_fuse_pass.py
new file mode 100644
index 0000000000000..c6f1411d5cfcf
--- /dev/null
+++ b/test/ir/pir/fused_pass/test_group_norm_silu_fuse_pass.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from pass_test import PassTest
+
+import paddle
+from paddle.base import core
+from paddle.pir.core import create_parameter
+
+
+class GroupNormSiluPattern(PassTest):
+    r"""
+    group_norm
+        |
+      silu
+    """
+
+    def is_program_valid(self, program=None):
+        return True
+
+    def sample_program(self):
+        for x_shape in [[2, 6, 4, 2]]:
+            dtype = None
+            if core.is_compiled_with_xpu():
+                dtype = 'float32'
+            elif core.is_compiled_with_cuda():
+                dtype = 'float16'
+            for epilson in [1e-5]:
+                for groups in [2]:
+                    rand_value = (
+                        0.001
+                        * paddle.rand(shape=[x_shape[1]], dtype=dtype).numpy()
+                    )
+                    with paddle.pir_utils.IrGuard():
+                        start_prog = paddle.static.Program()
+                        main_prog = paddle.static.Program()
+                        with paddle.pir.core.program_guard(
+                            main_prog, start_prog
+                        ):
+                            x = paddle.static.data(
+                                name='x', shape=x_shape, dtype=dtype
+                            )
+                            w = create_parameter(
+                                shape=[x_shape[1]],
+                                dtype=dtype,
+                                initializer=paddle.nn.initializer.Assign(
+                                    rand_value
+                                ),
+                            )
+                            b = create_parameter(
+                                shape=[x_shape[1]],
+                                dtype=dtype,
+                                initializer=paddle.nn.initializer.Assign(
+                                    rand_value
+                                ),
+                            )
+                            group_norm_out = paddle.nn.functional.group_norm(
+                                x,
+                                num_groups=groups,
+                                epsilon=epilson,
+                                weight=w,
+                                bias=b,
+                            )
+                            out = paddle.nn.functional.silu(group_norm_out)
+                            out = paddle.assign(out)
+                            if core.is_compiled_with_xpu():
+                                self.pass_attr_list = [
+                                    {'group_norm_silu_fuse_pass': {}},
+                                ]
+                            elif core.is_compiled_with_cuda():
+                                self.pass_attr_list = [
+                                    {'group_norm_silu_fuse_pass': {}},
+                                    {'transfer_layout_pass': {}},
+                                ]
+                            self.feeds = {
+                                "x": np.random.random(x_shape).astype(dtype),
+                            }
+                            self.fetch_list = [out]
+                            if core.is_compiled_with_xpu():
+                                self.valid_op_map = {
+                                    "pd_op.silu": 0,
+                                    "pd_op.group_norm": 0,
+                                    "pd_op.group_norm_silu_xpu": 1,
+                                }
+                            elif core.is_compiled_with_cuda():
+                                self.valid_op_map = {
+                                    "pd_op.silu": 0,
+                                    "pd_op.group_norm": 0,
+                                    "pd_op.add_group_norm_silu": 1,
+                                }
+
+                            yield [main_prog, start_prog], False
+
+    def setUp(self):
+        if core.is_compiled_with_xpu():
+            self.places.append(paddle.XPUPlace(0))
+        elif core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_check_output(self):
+        self.check_pass_correct()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/ir/pir/test_build_op.py b/test/ir/pir/test_build_op.py
similarity index 88%
rename from test/deprecated/ir/pir/test_build_op.py
rename to test/ir/pir/test_build_op.py
index cd0ae03b33958..ac92d124a0dc5 100644
--- a/test/deprecated/ir/pir/test_build_op.py
+++ b/test/ir/pir/test_build_op.py
@@ -22,19 +22,20 @@
 
 def get_ir_program():
     paddle.enable_static()
-    x = paddle.randn([4, 4])
-    main_program, start_program = (
-        paddle.static.Program(),
-        paddle.static.Program(),
-    )
-    with paddle.static.program_guard(main_program, start_program):
-        x_s = paddle.static.data('x', [4, 4], x.dtype)
-        x_s.stop_gradient = False
-        y_s = paddle.matmul(x_s, x_s)
-        y_s = paddle.add(x_s, y_s)
-        y_s = paddle.tanh(y_s)
-    pir_program = pir.translate_to_pir(main_program.desc)
-    return pir_program
+    with paddle.pir_utils.OldIrGuard():
+        x = paddle.randn([4, 4])
+        main_program, start_program = (
+            paddle.static.Program(),
+            paddle.static.Program(),
+        )
+        with paddle.static.program_guard(main_program, start_program):
+            x_s = paddle.static.data('x', [4, 4], x.dtype)
+            x_s.stop_gradient = False
+            y_s = paddle.matmul(x_s, x_s)
+            y_s = paddle.add(x_s, y_s)
+            y_s = paddle.tanh(y_s)
+        pir_program = pir.translate_to_pir(main_program.desc)
+        return pir_program
 
 
 class TestBuildOp(unittest.TestCase):
diff --git a/test/deprecated/ir/pir/test_ir_backward.py b/test/ir/pir/test_ir_backward.py
similarity index 85%
rename from test/deprecated/ir/pir/test_ir_backward.py
rename to test/ir/pir/test_ir_backward.py
index 3f8a77eed354f..c1818aa493a37 100644
--- a/test/deprecated/ir/pir/test_ir_backward.py
+++ b/test/ir/pir/test_ir_backward.py
@@ -26,23 +26,21 @@
 
 def get_ir_program_0():
     paddle.enable_static()
-    x = paddle.randn([4, 4])
-    main_program, start_program = (
-        paddle.static.Program(),
-        paddle.static.Program(),
-    )
-    with paddle.static.program_guard(main_program, start_program):
-        x_s = paddle.static.data('x', [4, 4], x.dtype)
-        x_s.stop_gradient = False
-        k_s = paddle.tanh(x_s)
-    pir_program = pir.translate_to_pir(main_program.desc)
-    return pir_program
+    with paddle.pir_utils.OldIrGuard():
+        x = paddle.randn([4, 4])
+        main_program, start_program = (
+            paddle.static.Program(),
+            paddle.static.Program(),
+        )
+        with paddle.static.program_guard(main_program, start_program):
+            x_s = paddle.static.data('x', [4, 4], x.dtype)
+            x_s.stop_gradient = False
+            k_s = paddle.tanh(x_s)
+        pir_program = pir.translate_to_pir(main_program.desc)
+        return pir_program
 
 
 class TesBackward_1(unittest.TestCase):
-    def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
-
     def test_grad(self):
         pir_program = get_ir_program_0()
         input = pir_program.global_block().ops[-1].operand(0).source()
@@ -138,26 +136,24 @@ def test_split(self):
 
 def get_ir_program_1():
     paddle.enable_static()
-    x = paddle.randn([2, 2])
-    main_program, start_program = (
-        paddle.static.Program(),
-        paddle.static.Program(),
-    )
-    with paddle.static.program_guard(main_program, start_program):
-        x_s = paddle.static.data('x', [4, 4], x.dtype)
-        x_s.stop_gradient = False
-
-        k_s = paddle.tanh(x_s)
-        z_x = paddle.tanh(x_s)
-        out = paddle.add(z_x, k_s)
-    pir_program = pir.translate_to_pir(main_program.desc)
-    return pir_program
+    with paddle.pir_utils.OldIrGuard():
+        x = paddle.randn([2, 2])
+        main_program, start_program = (
+            paddle.static.Program(),
+            paddle.static.Program(),
+        )
+        with paddle.static.program_guard(main_program, start_program):
+            x_s = paddle.static.data('x', [4, 4], x.dtype)
+            x_s.stop_gradient = False
+
+            k_s = paddle.tanh(x_s)
+            z_x = paddle.tanh(x_s)
+            out = paddle.add(z_x, k_s)
+        pir_program = pir.translate_to_pir(main_program.desc)
+        return pir_program
 
 
 class TesBackward_2(unittest.TestCase):
-    def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
-
     def test_add_n(self):
         pir_program = get_ir_program_1()
         input_x = pir_program.global_block().ops[-3].operand(0).source()
@@ -216,23 +212,21 @@ def test_concat(self):
 
 def get_ir_program_2():
     paddle.enable_static()
-    x = paddle.randn([2, 2])
-    main_program, start_program = (
-        paddle.static.Program(),
-        paddle.static.Program(),
-    )
-    with paddle.static.program_guard(main_program, start_program):
-        x_s = paddle.static.data('x', [4, 4], x.dtype)
-        x_s.stop_gradient = False
-        k_s = paddle.sum(x_s, axis=(-1,), keepdim=False)
-    pir_program = pir.translate_to_pir(main_program.desc)
-    return pir_program
+    with paddle.pir_utils.OldIrGuard():
+        x = paddle.randn([2, 2])
+        main_program, start_program = (
+            paddle.static.Program(),
+            paddle.static.Program(),
+        )
+        with paddle.static.program_guard(main_program, start_program):
+            x_s = paddle.static.data('x', [4, 4], x.dtype)
+            x_s.stop_gradient = False
+            k_s = paddle.sum(x_s, axis=(-1,), keepdim=False)
+        pir_program = pir.translate_to_pir(main_program.desc)
+        return pir_program
 
 
 class TestBackward_3(unittest.TestCase):
-    def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
-
     def test_basic_network(self):
         pir_program = get_ir_program_2()
         x = pir_program.global_block().ops[-1].operand(0).source()
@@ -250,9 +244,6 @@ def test_basic_network(self):
 
 
 class TestBackward_4(unittest.TestCase):
-    def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
-
     def test_basic_network(self):
         if not paddle.framework.in_pir_mode():
             return
@@ -293,9 +284,6 @@ def false_func():
 
 
 class TestBackward_5(unittest.TestCase):
-    def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
-
     def test_skip_vjp(self):
         if not paddle.framework.in_pir_mode():
             return
diff --git a/test/deprecated/ir/pir/test_ir_pybind.py b/test/ir/pir/test_ir_pybind.py
similarity index 80%
rename from test/deprecated/ir/pir/test_ir_pybind.py
rename to test/ir/pir/test_ir_pybind.py
index afe8b57385379..62f9066c58c37 100644
--- a/test/deprecated/ir/pir/test_ir_pybind.py
+++ b/test/ir/pir/test_ir_pybind.py
@@ -22,21 +22,22 @@
 
 
 def get_ir_program():
-    x = paddle.randn([4, 4])
-    main_program, start_program = (
-        paddle.static.Program(),
-        paddle.static.Program(),
-    )
-    with paddle.static.program_guard(main_program, start_program):
-        x_s = paddle.static.data('x', [4, 4], x.dtype)
-        x_s.stop_gradient = False
-        y_s = paddle.matmul(x_s, x_s)
-        z_s = paddle.add(y_s, y_s)
-        k_s = paddle.tanh(z_s)
-        q_s = paddle.unsqueeze(k_s, [2])
-
-    pir_program = pir.translate_to_pir(main_program.desc)
-    return pir_program
+    with paddle.pir_utils.OldIrGuard():
+        x = paddle.randn([4, 4])
+        main_program, start_program = (
+            paddle.static.Program(),
+            paddle.static.Program(),
+        )
+        with paddle.static.program_guard(main_program, start_program):
+            x_s = paddle.static.data('x', [4, 4], x.dtype)
+            x_s.stop_gradient = False
+            y_s = paddle.matmul(x_s, x_s)
+            z_s = paddle.add(y_s, y_s)
+            k_s = paddle.tanh(z_s)
+            q_s = paddle.unsqueeze(k_s, [2])
+
+        pir_program = pir.translate_to_pir(main_program.desc)
+        return pir_program
 
 
 class TestPybind(unittest.TestCase):
@@ -165,38 +166,43 @@ def test_type(self):
         self.assertEqual(add_op.result(0).is_selected_row_type(), True)
 
     def test_attr(self):
-        main_program, start_program = (
-            paddle.static.Program(),
-            paddle.static.Program(),
-        )
-        with paddle.static.program_guard(main_program, start_program):
-            conv_data = paddle.static.data(
-                'conv_data', [None, 3, 32, 32], dtype='float32'
+        with paddle.pir_utils.OldIrGuard():
+            main_program, start_program = (
+                paddle.static.Program(),
+                paddle.static.Program(),
             )
-            conv2d_out = paddle.static.nn.conv2d(
-                input=conv_data,
-                num_filters=2,
-                filter_size=3,
-                stride=3,
-                act="relu",
+            with paddle.static.program_guard(main_program, start_program):
+                conv_data = paddle.static.data(
+                    'conv_data', [None, 3, 32, 32], dtype='float32'
+                )
+                conv2d_out = paddle.static.nn.conv2d(
+                    input=conv_data,
+                    num_filters=2,
+                    filter_size=3,
+                    stride=3,
+                    act="relu",
+                )
+                full_out = paddle.tensor.fill_constant(
+                    shape=[4, 4], dtype="float32", value=2
+                )
+
+            pir_program = pir.translate_to_pir(main_program.desc)
+            conv_attr = pir_program.global_block().ops[3].attrs()
+            full_attr = pir_program.global_block().ops[8].attrs()
+            self.assertEqual(conv_attr["stop_gradient"], [False])
+            self.assertEqual(conv_attr["dilations"], [1, 1])
+            self.assertEqual(conv_attr["data_format"], "NCHW")
+            self.assertEqual(conv_attr["strides"], [3, 3])
+            self.assertEqual(conv_attr["paddings"], [0, 0])
+            self.assertEqual(conv_attr["padding_algorithm"], "EXPLICIT")
+            self.assertEqual(conv_attr["groups"], 1)
+            self.assertEqual(
+                full_attr["dtype"], paddle.base.core.DataType.FLOAT32
             )
-            full_out = paddle.tensor.fill_constant(
-                shape=[4, 4], dtype="float32", value=2
+            self.assertTrue(
+                isinstance(full_attr["place"], paddle.base.core.Place)
             )
 
-        pir_program = pir.translate_to_pir(main_program.desc)
-        conv_attr = pir_program.global_block().ops[3].attrs()
-        full_attr = pir_program.global_block().ops[8].attrs()
-        self.assertEqual(conv_attr["stop_gradient"], [False])
-        self.assertEqual(conv_attr["dilations"], [1, 1])
-        self.assertEqual(conv_attr["data_format"], "NCHW")
-        self.assertEqual(conv_attr["strides"], [3, 3])
-        self.assertEqual(conv_attr["paddings"], [0, 0])
-        self.assertEqual(conv_attr["padding_algorithm"], "EXPLICIT")
-        self.assertEqual(conv_attr["groups"], 1)
-        self.assertEqual(full_attr["dtype"], paddle.base.core.DataType.FLOAT32)
-        self.assertTrue(isinstance(full_attr["place"], paddle.base.core.Place))
-
     def test_operands(self):
         pir_program = get_ir_program()
         matmul_op = pir_program.global_block().ops[1]
diff --git a/test/deprecated/ir/pir/test_ir_vjp.py b/test/ir/pir/test_ir_vjp.py
similarity index 62%
rename from test/deprecated/ir/pir/test_ir_vjp.py
rename to test/ir/pir/test_ir_vjp.py
index 8401761ba3a05..53268e7026422 100644
--- a/test/deprecated/ir/pir/test_ir_vjp.py
+++ b/test/ir/pir/test_ir_vjp.py
@@ -22,17 +22,20 @@
 
 
 def get_ir_program():
-    main_program, start_program = (
-        paddle.static.Program(),
-        paddle.static.Program(),
-    )
-    with paddle.static.program_guard(main_program, start_program):
-        x = paddle.static.data('x', [4, 4], 'float32')
-        x.stop_gradient = False
-        paddle.tanh(x)
-        paddle.tensor.fill_constant(shape=[4, 4], dtype='float32', value=2.0)
-    pir_program = pir.translate_to_pir(main_program.desc)
-    return pir_program
+    with paddle.pir_utils.OldIrGuard():
+        main_program, start_program = (
+            paddle.static.Program(),
+            paddle.static.Program(),
+        )
+        with paddle.static.program_guard(main_program, start_program):
+            x = paddle.static.data('x', [4, 4], 'float32')
+            x.stop_gradient = False
+            paddle.tanh(x)
+            paddle.tensor.fill_constant(
+                shape=[4, 4], dtype='float32', value=2.0
+            )
+        pir_program = pir.translate_to_pir(main_program.desc)
+        return pir_program
 
 
 class TestTanhVjp(unittest.TestCase):
@@ -92,20 +95,23 @@ def test_tanh_vjp2(self):
 
 class TestMeanVjp(unittest.TestCase):
     def test_mean_vjp1(self):
-        main_program, start_program = (
-            paddle.static.Program(),
-            paddle.static.Program(),
-        )
-        with paddle.static.program_guard(main_program, start_program):
-            x = paddle.static.data('x', [4, 4], 'float32')
-            x.stop_gradient = False
-            paddle.mean(x, axis=[0, 1])
-            paddle.tensor.fill_constant(shape=[1], dtype='float32', value=2.0)
-        pir_program = pir.translate_to_pir(main_program.desc)
-        fill_constant_op = pir_program.global_block().ops[-1]
-        mean_op = pir_program.global_block().ops[-2]
-        out_grads = [[fill_constant_op.result(0)]]
-        stop_gradients = [[False]]
+        with paddle.pir_utils.OldIrGuard():
+            main_program, start_program = (
+                paddle.static.Program(),
+                paddle.static.Program(),
+            )
+            with paddle.static.program_guard(main_program, start_program):
+                x = paddle.static.data('x', [4, 4], 'float32')
+                x.stop_gradient = False
+                paddle.mean(x, axis=[0, 1])
+                paddle.tensor.fill_constant(
+                    shape=[1], dtype='float32', value=2.0
+                )
+            pir_program = pir.translate_to_pir(main_program.desc)
+            fill_constant_op = pir_program.global_block().ops[-1]
+            mean_op = pir_program.global_block().ops[-2]
+            out_grads = [[fill_constant_op.result(0)]]
+            stop_gradients = [[False]]
         with paddle.pir.core.program_guard(pir_program):
             grad_outs = call_vjp(
                 mean_op,
@@ -138,20 +144,23 @@ def test_mean_vjp1(self):
             self.assertEqual(len(pir_program.global_block().ops), 4)
 
     def test_mean_vjp2(self):
-        main_program, start_program = (
-            paddle.static.Program(),
-            paddle.static.Program(),
-        )
-        with paddle.static.program_guard(main_program, start_program):
-            x = paddle.static.data('x', [4, 4], 'float32')
-            x.stop_gradient = False
-            paddle.mean(x, axis=[0, 1])
-            paddle.tensor.fill_constant(shape=[1], dtype='float32', value=2.0)
-        pir_program = pir.translate_to_pir(main_program.desc)
-        fill_constant_op = pir_program.global_block().ops[-1]
-        mean_op = pir_program.global_block().ops[-2]
-        out_grads = [[fill_constant_op.result(0)]]
-        stop_gradients = [[True]]
+        with paddle.pir_utils.OldIrGuard():
+            main_program, start_program = (
+                paddle.static.Program(),
+                paddle.static.Program(),
+            )
+            with paddle.static.program_guard(main_program, start_program):
+                x = paddle.static.data('x', [4, 4], 'float32')
+                x.stop_gradient = False
+                paddle.mean(x, axis=[0, 1])
+                paddle.tensor.fill_constant(
+                    shape=[1], dtype='float32', value=2.0
+                )
+            pir_program = pir.translate_to_pir(main_program.desc)
+            fill_constant_op = pir_program.global_block().ops[-1]
+            mean_op = pir_program.global_block().ops[-2]
+            out_grads = [[fill_constant_op.result(0)]]
+            stop_gradients = [[True]]
         with paddle.pir.core.program_guard(pir_program):
             grad_outs = call_vjp(
                 mean_op,
@@ -165,20 +174,23 @@ def test_mean_vjp2(self):
 
 class TesthasVjp(unittest.TestCase):
     def test_has_vjp(self):
-        main_program, start_program = (
-            paddle.static.Program(),
-            paddle.static.Program(),
-        )
-        with paddle.static.program_guard(main_program, start_program):
-            x = paddle.static.data('x', [4, 4], 'float32')
-            x.stop_gradient = False
-            paddle.mean(x, axis=[0, 1])
-            paddle.tensor.fill_constant(shape=[1], dtype='float32', value=2.0)
-        pir_program = pir.translate_to_pir(main_program.desc)
-        fill_constant_op = pir_program.global_block().ops[-1]
-        mean_op = pir_program.global_block().ops[-2]
-        self.assertEqual(has_vjp(fill_constant_op), False)
-        self.assertEqual(has_vjp(mean_op), True)
+        with paddle.pir_utils.OldIrGuard():
+            main_program, start_program = (
+                paddle.static.Program(),
+                paddle.static.Program(),
+            )
+            with paddle.static.program_guard(main_program, start_program):
+                x = paddle.static.data('x', [4, 4], 'float32')
+                x.stop_gradient = False
+                paddle.mean(x, axis=[0, 1])
+                paddle.tensor.fill_constant(
+                    shape=[1], dtype='float32', value=2.0
+                )
+            pir_program = pir.translate_to_pir(main_program.desc)
+            fill_constant_op = pir_program.global_block().ops[-1]
+            mean_op = pir_program.global_block().ops[-2]
+            self.assertEqual(has_vjp(fill_constant_op), False)
+            self.assertEqual(has_vjp(mean_op), True)
 
 
 if __name__ == "__main__":
diff --git a/test/ir/pir/test_pass_manager.py b/test/ir/pir/test_pass_manager.py
new file mode 100644
index 0000000000000..92113fabd5842
--- /dev/null
+++ b/test/ir/pir/test_pass_manager.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle import pir
+from paddle.base import core
+from paddle.framework import LayerHelper
+
+paddle.enable_static()
+
+
+class TestShadowOutputSlice(unittest.TestCase):
+    def test_op(self):
+        """dead_code_elimination_pass must remove the unused multiply op."""
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.ones([3, 9, 5], dtype='float32')
+                    y = paddle.static.data(
+                        name="y", shape=[3, 9, 5], dtype="float32"
+                    )
+                    z = x * y  # will be eliminated
+
+                    _, out, _ = paddle.split(x, num_or_sections=3, axis=1)
+                    helper = LayerHelper('shadow_output')
+                    helper.append_op(
+                        type="shadow_output",
+                        inputs={"x": [out.name]},
+                        outputs={"out": [y.name]},
+                        attrs={"name": out.name},
+                    )
+
+            new_program = pir.translate_to_pir(main_program.desc)
+            op_names = [op.name() for op in new_program.global_block().ops]
+            self.assertIn('pd_op.multiply', op_names)
+            pm = pir.PassManager()
+            # apply pass to eliminate dead code
+            pm.add_pass('dead_code_elimination_pass', {})
+            pm.run(new_program)
+            op_names = [op.name() for op in new_program.global_block().ops]
+            self.assertEqual(pm.passes(), ['dead_code_elimination_pass'])
+            self.assertFalse(pm.empty())
+            self.assertNotIn(
+                'pd_op.multiply', op_names
+            )  # multiply is eliminated because its output is not used
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/ir/pir/test_special_op_translator.py b/test/ir/pir/test_special_op_translator.py
new file mode 100644
index 0000000000000..09440f2fc48bd
--- /dev/null
+++ b/test/ir/pir/test_special_op_translator.py
@@ -0,0 +1,586 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import pir
+from paddle.base import core
+from paddle.framework import LayerHelper
+
+paddle.enable_static()
+
+
+class TestCastOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.to_tensor([2, 3, 4], 'float64')
+                    y = paddle.cast(x, 'uint8')
+            # The translated program must come with a non-empty mapping.
+            _, mappings = pir.translate_to_pir_with_param_map(main_program.desc)
+            self.assertGreater(len(str(mappings)), 0, "no mapping found")
+
+
+class TestCondWithInplace(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+
+            def cond_with_inplace():
+                x = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
+                y = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
+                running_mean = paddle.to_tensor([0], dtype="float32")
+                running_variance = paddle.to_tensor([1], dtype="float32")
+                weight = paddle.to_tensor([2], dtype="float32")
+                bias = paddle.to_tensor([1], dtype="float32")
+                if x > y:
+                    y = paddle.nn.functional.batch_norm(
+                        x, running_mean, running_variance, weight, bias
+                    )
+                else:
+                    y = paddle.nn.functional.batch_norm(
+                        x, running_mean, running_variance, weight, bias
+                    )
+
+            legacy_program = paddle.jit.to_static(
+                cond_with_inplace,
+                input_spec=[],
+                full_graph=True,
+            )
+            # Translating the control-flow program must yield a PIR program.
+            translated = pir.translate_to_pir(legacy_program.main_program.desc)
+            self.assertIsNotNone(translated)
+
+    def test_nested_op(self):
+        with paddle.pir_utils.OldIrGuard():
+
+            def cond_with_inplace():
+                x = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
+                y = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
+                z = paddle.ones(shape=[2, 1, 2, 3], dtype="float32")
+                running_mean = paddle.to_tensor([0], dtype="float32")
+                running_variance = paddle.to_tensor([1], dtype="float32")
+                weight = paddle.to_tensor([2], dtype="float32")
+                bias = paddle.to_tensor([1], dtype="float32")
+                if x > y:
+                    if y > z:
+                        z = paddle.nn.functional.batch_norm(
+                            z, running_mean, running_variance, weight, bias
+                        )
+                    else:
+                        y = paddle.nn.functional.batch_norm(
+                            x, running_mean, running_variance, weight, bias
+                        )
+                else:
+                    if y > z:
+                        z = paddle.nn.functional.batch_norm(
+                            z, running_mean, running_variance, weight, bias
+                        )
+                    else:
+                        y = paddle.nn.functional.batch_norm(
+                            x, running_mean, running_variance, weight, bias
+                        )
+
+            legacy_program = paddle.jit.to_static(
+                cond_with_inplace,
+                input_spec=[],
+                full_graph=True,
+            )
+            # Translating the nested-branch program must yield a PIR program.
+            translated = pir.translate_to_pir(legacy_program.main_program.desc)
+            self.assertIsNotNone(translated)
+
+
+class TestElementwiseOpTranscriber(unittest.TestCase):
+    def test_elementwise_without_y_grad(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            exe = paddle.static.Executor(place)
+
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x_data = np.random.rand(100, 2, 3)
+                    y_data = np.random.rand(100)
+                    x = paddle.to_tensor(x_data, dtype='float32')
+                    x.stop_gradient = False
+                    y = paddle.to_tensor(y_data, dtype='float32')
+                    # elementwise_add with axis=0 via the private helper.
+                    out1 = paddle.tensor.math._elementwise_op(
+                        LayerHelper('elementwise_add', x=x, y=y, axis=0)
+                    )
+                    out1.stop_gradient = False
+                    mean = paddle.mean(out1)
+                    paddle.static.append_backward(mean)
+                    # Compare the fetched result with NumPy broadcasting.
+                    out = exe.run(main_program, {}, fetch_list=[out1])
+                    np.testing.assert_allclose(
+                        out[0],
+                        x_data + y_data.reshape(100, 1, 1),
+                        rtol=1e-6,
+                        atol=1e-6,
+                    )
+
+    def test_elementwise_with_y_grad(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            exe = paddle.static.Executor(place)
+
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x_data = np.random.rand(100, 2, 3)
+                    y_data = np.random.rand(100)
+                    x = paddle.to_tensor(x_data, dtype='float32')
+                    x.stop_gradient = False
+                    y = paddle.to_tensor(y_data, dtype='float32')
+                    y.stop_gradient = False
+                    # Same graph as above, but y also takes part in backward.
+                    out1 = paddle.tensor.math._elementwise_op(
+                        LayerHelper('elementwise_add', x=x, y=y, axis=0)
+                    )
+                    out1.stop_gradient = False
+                    mean = paddle.mean(out1)
+                    paddle.static.append_backward(mean)
+                    # Compare the fetched result with NumPy broadcasting.
+                    out = exe.run(main_program, {}, fetch_list=[out1])
+                    np.testing.assert_allclose(
+                        out[0],
+                        x_data + y_data.reshape(100, 1, 1),
+                        rtol=1e-6,
+                        atol=1e-6,
+                    )
+
+    def test_add_inplace(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            exe = paddle.static.Executor(place)
+
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.ones(shape=(100, 2, 3), dtype='float32')
+                    y = paddle.ones(shape=(100, 2, 3), dtype='float32')
+                    # Append an op whose output variable is also its input Y.
+                    helper = LayerHelper('elementwise_add')
+                    helper.append_op(
+                        type="elementwise_add",
+                        inputs={"X": x, "Y": y},
+                        outputs={"Out": y},
+                        attrs={"axis": -1},
+                    )
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestEmbeddingOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.static.data(
+                        name="x", shape=[2, 4], dtype=np.int64
+                    )
+                    embedding = paddle.nn.Embedding(
+                        10,
+                        3,
+                        weight_attr=paddle.nn.initializer.Constant(value=1.0),
+                    )
+                    output = embedding(x)
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestIncrementOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    data = paddle.zeros(shape=[1], dtype='float32')
+                    counter = paddle.increment(data)
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestAssignValueOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.to_tensor(
+                        [[0.1, 0.2], [0.3, 0.4]],
+                        place=paddle.CPUPlace(),
+                        stop_gradient=False,
+                    )
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestRnnOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.randn((4, 16))
+                    prev_h = paddle.randn((4, 32))
+
+                    cell = paddle.nn.SimpleRNNCell(16, 32)
+                    y, h = cell(x, prev_h)
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestEmptyVarTranslate(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x1 = paddle.rand(shape=[3, 3], dtype="float32")
+                    x1.stop_gradient = False
+                    weight = paddle.full(
+                        shape=[3, 3], fill_value=0.5, dtype="float32"
+                    )
+                    y = paddle.nn.functional.linear(x1, weight)
+                    y.stop_gradient = True  # y gets no gradient in backward
+                    out1 = paddle.concat(x=[x1, y], axis=1)
+                    out2 = paddle.mean(out1)
+                    sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1)
+                    sgd_optimizer.minimize(out2)
+            _ = pir.translate_to_pir(main_program.desc)  # must not raise
+
+
+class TestOneHotOpTranscriber(unittest.TestCase):
+    def test_mutable_attribute(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    depth = paddle.assign(np.array([10], dtype=np.int32))
+                    label = paddle.static.data(
+                        name="label", shape=[-1, 1], dtype="int64"
+                    )
+                    one_hot_label = paddle.nn.functional.one_hot(
+                        x=label, num_classes=depth
+                    )
+            # num_classes comes from paddle.assign; translation must not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+    def test_normal_attribute(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    depth = 10
+                    label = paddle.static.data(
+                        name="label", shape=[-1, 1], dtype="int64"
+                    )
+                    one_hot_label = paddle.nn.functional.one_hot(
+                        x=label, num_classes=depth
+                    )
+            # num_classes is a plain int here; translation must not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestReduceOpTranscriber(unittest.TestCase):
+    def test_reduce_all(self):
+        place = core.Place()
+        place.set_place(paddle.CPUPlace())
+        exe = paddle.static.Executor(place)
+
+        new_scope = paddle.static.Scope()
+        main_program = paddle.static.Program()
+        with paddle.static.scope_guard(new_scope):
+            with paddle.static.program_guard(main_program):
+                arr = np.ones([2, 2], dtype="float32")
+                x = paddle.to_tensor(arr, dtype='int32')
+                out1 = paddle.all(x)
+                # Run the program and compare with np.all over every element.
+                out = exe.run(main_program, {}, fetch_list=[out1])
+                np.testing.assert_array_equal(out[0], np.all(arr))
+
+    def test_with_axis(self):
+        place = core.Place()
+        place.set_place(paddle.CPUPlace())
+        exe = paddle.static.Executor(place)
+
+        new_scope = paddle.static.Scope()
+        main_program = paddle.static.Program()
+        with paddle.static.scope_guard(new_scope):
+            with paddle.static.program_guard(main_program):
+                arr = np.ones([2, 2], dtype="float32")
+                x = paddle.to_tensor(arr, dtype='int32')
+                out1 = paddle.all(x, axis=0)
+                # Run the program and compare with np.all along axis 0.
+                out = exe.run(main_program, {}, fetch_list=[out1])
+                np.testing.assert_array_equal(out[0], np.all(arr, axis=0))
+
+
+class TestIndexPutOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.randn([2, 3])
+                    indices = [
+                        paddle.randint(0, 2, [2]),
+                        paddle.randint(0, 1, [2]),
+                    ]
+                    value = paddle.randn([2])
+                    y = paddle.index_put(x, indices, value, False)
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestGradAddOpTranscriber(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x_data = np.random.rand(100, 2, 3)
+                    y_data = np.random.rand(100, 1, 1)
+                    x = paddle.to_tensor(x_data, dtype='float32')
+                    x.stop_gradient = False
+                    y = paddle.to_tensor(y_data, dtype='float32')
+                    # Append a raw grad_add op through LayerHelper.
+                    helper = LayerHelper('grad_add')
+                    out = helper.create_variable_for_type_inference("float")
+                    helper.append_op(
+                        type="grad_add",
+                        inputs={"X": x, "Y": y},
+                        outputs={"Out": out},
+                        attrs={"axis": -1},
+                    )
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestShadowOutputSlice(unittest.TestCase):
+    def test_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.rand([3, 9, 5])
+                    y = paddle.static.data(
+                        name="y", shape=[3, 9, 5], dtype="float32"
+                    )
+
+                    _, out, _ = paddle.split(x, num_or_sections=3, axis=1)
+                    helper = LayerHelper('shadow_output')
+                    helper.append_op(
+                        type="shadow_output",
+                        inputs={"x": [out.name]},
+                        outputs={"out": [y.name]},
+                        attrs={"name": out.name},
+                    )
+            # No assertion: the test passes when translation does not raise.
+            _ = pir.translate_to_pir(main_program.desc)
+
+
+class TestSetValueOp(unittest.TestCase):
+    def test_no_mutable_attribute(self):
+        """setitem with the constant index (0, 0) writes 6 into x[0, 0]."""
+        place = core.Place()
+        place.set_place(paddle.CPUPlace())
+        exe = paddle.static.Executor(place)
+
+        new_scope = paddle.static.Scope()
+        main_program = paddle.static.Program()
+        with paddle.static.scope_guard(new_scope):
+            with paddle.static.program_guard(main_program):
+                x = paddle.ones(shape=[2, 3, 4], dtype="float32")
+                x = paddle.static.setitem(x, (0, 0), 6)
+        ret = exe.run(main_program, fetch_list=[x])
+
+        x_data = np.ones([2, 3, 4]).astype("float32")
+        x_data[0, 0] = 6
+        np.testing.assert_array_equal(ret[0], x_data)
+
+    def test_with_mutable_attribute(self):
+        """setitem indexed by a 0-D int32 tensor writes 6 into x[0]."""
+        place = core.Place()
+        place.set_place(paddle.CPUPlace())
+        exe = paddle.static.Executor(place)
+
+        new_scope = paddle.static.Scope()
+        main_program = paddle.static.Program()
+        with paddle.static.scope_guard(new_scope):
+            with paddle.static.program_guard(main_program):
+                x = paddle.ones(shape=[2, 3, 4], dtype="float32")
+                zero = paddle.full([], 0, dtype="int32")
+                x = paddle.static.setitem(x, zero, 6)
+        ret = exe.run(main_program, fetch_list=[x])
+
+        x_data = np.ones([2, 3, 4]).astype("float32")
+        x_data[0] = 6
+        np.testing.assert_array_equal(ret[0], x_data)
+
+    def test_grad(self):
+        """x's gradient must be zero on the slice set_value overwrote."""
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            exe = paddle.static.Executor(place)
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            input_shape = [7, 6, 5, 4, 3, 2]
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.ones(shape=input_shape, dtype="float32")
+                    value = paddle.tensor.fill_constant([1, 3, 2], "float32", 1)
+                    # test stop_gradient
+                    value.stop_gradient = False
+                    x.stop_gradient = False
+                    attrs = {
+                        'axes': [0],
+                        'starts': [6],
+                        'ends': [0],
+                        'steps': [-4],
+                        'decrease_axes': [],
+                        'none_axes': [],
+                        'dtype': paddle.float32,
+                    }
+                    inputs = {'Input': x, 'ValueTensor': value}
+
+                    helper = LayerHelper("set_value")
+                    y = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+                    helper.append_op(
+                        type="set_value",
+                        inputs=inputs,
+                        outputs={'Out': y},
+                        attrs=attrs,
+                    )
+                    y2 = y + 1
+                    loss = paddle.sum(y2)
+                    opt = paddle.optimizer.Adam()
+                    opt.minimize(loss)
+
+                    fetch_list = [x.grad_name, value.grad_name]
+                    ret = exe.run(main_program, fetch_list=fetch_list)
+                    self.assertTrue((ret[0][6:0:-4] == 0).all())
+
+
+class TestShareBufferOpTranscriber(unittest.TestCase):
+    def test_program(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    x = paddle.ones(shape=(100, 2, 3), dtype='float32')
+                    y = paddle.ones(shape=(100, 2, 3), dtype='float32')
+
+                    helper = LayerHelper('share_buffer')
+                    helper.append_op(
+                        type="share_buffer",
+                        inputs={"X": x},
+                        outputs={"Out": y, "XOut": x},
+                    )
+            pir_program = pir.translate_to_pir(main_program.desc)
+            share_op_name = pir_program.global_block().ops[2].name()
+            msg = "share_buffer should be translated to share_data_"
+            self.assertEqual(share_op_name, "pd_op.share_data_", msg)
+
+
+class TestDataOp(unittest.TestCase):
+    def test_data_op(self):
+        with paddle.pir_utils.OldIrGuard():
+            place = core.Place()
+            place.set_place(paddle.CPUPlace())
+            # The program holds a single int64 input declared as "y".
+            new_scope = paddle.static.Scope()
+            main_program = paddle.static.Program()
+            with paddle.static.scope_guard(new_scope):
+                with paddle.static.program_guard(main_program):
+                    _ = paddle.static.data(
+                        name="y", shape=[3, 9, 5], dtype="int64"
+                    )
+            pir_program = pir.translate_to_pir(main_program.desc)
+            self.assertGreater(len(pir_program.global_block().ops), 0)
+            data_op = pir_program.global_block().ops[0]
+            self.assertEqual(data_op.name(), "pd_op.data")
+            self.assertIn("dtype", data_op.attrs())
+            self.assertEqual(str(data_op.attrs()["dtype"]), "paddle.int64")
+
+
+class TestCheckUnregisteredOp(unittest.TestCase):
+    def test_program(self):
+        with paddle.pir_utils.OldIrGuard():
+            main_program = paddle.static.Program()
+            with paddle.static.program_guard(main_program):
+                x = paddle.randn((4, 16))
+                prev_h = paddle.randn((4, 32))
+
+                cell = paddle.nn.SimpleRNNCell(16, 32)
+                y, h = cell(x, prev_h)
+            # check_unregistered_ops must return an empty collection here.
+            ops = pir.check_unregistered_ops(main_program.desc)
+            self.assertEqual(len(ops), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/ir/pir/test_standalone_pir.py b/test/ir/pir/test_standalone_pir.py
similarity index 91%
rename from test/deprecated/ir/pir/test_standalone_pir.py
rename to test/ir/pir/test_standalone_pir.py
index 6104cf533baa8..73c36afedb548 100644
--- a/test/deprecated/ir/pir/test_standalone_pir.py
+++ b/test/ir/pir/test_standalone_pir.py
@@ -296,36 +296,36 @@ def tearDown(self):
         self.temp_dir.cleanup()
 
     def test_with_pir(self):
-        paddle.disable_static()
-
-        linear = paddle.nn.Linear(10, 10)
-        path = os.path.join(self.model_path, "linear")
-
-        paddle.jit.save(
-            linear,
-            path,
-            input_spec=[paddle.static.InputSpec([10, 10], 'float32', 'x')],
-        )
-
-        paddle.enable_static()
-        place = (
-            paddle.CUDAPlace(0)
-            if paddle.is_compiled_with_cuda()
-            else paddle.CPUPlace()
-        )
+        with paddle.pir_utils.OldIrGuard():
+            paddle.disable_static()
+            linear = paddle.nn.Linear(10, 10)
+            path = os.path.join(self.model_path, "linear")
+
+            paddle.jit.save(
+                linear,
+                path,
+                input_spec=[paddle.static.InputSpec([10, 10], 'float32', 'x')],
+            )
 
-        exe = paddle.static.Executor(place)
+            paddle.enable_static()
+            place = (
+                paddle.CUDAPlace(0)
+                if paddle.is_compiled_with_cuda()
+                else paddle.CPUPlace()
+            )
 
-        [
-            inference_program,
-            feed_target_names,
-            fetch_targets,
-        ] = paddle.static.io.load_inference_model(
-            self.model_path,
-            executor=exe,
-            model_filename="linear.pdmodel",
-            params_filename="linear.pdiparams",
-        )
+            exe = paddle.static.Executor(place)
+
+            [
+                inference_program,
+                feed_target_names,
+                fetch_targets,
+            ] = paddle.static.io.load_inference_model(
+                self.model_path,
+                executor=exe,
+                model_filename="linear.pdmodel",
+                params_filename="linear.pdiparams",
+            )
 
 
 class TestPirConcatDygraph(unittest.TestCase):
diff --git a/test/deprecated/ir/pir/translator/CMakeLists.txt b/test/ir/pir/translator/CMakeLists.txt
similarity index 100%
rename from test/deprecated/ir/pir/translator/CMakeLists.txt
rename to test/ir/pir/translator/CMakeLists.txt
diff --git a/test/deprecated/ir/pir/translator/test_all_reduce_translator.py b/test/ir/pir/translator/test_all_reduce_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_all_reduce_translator.py
rename to test/ir/pir/translator/test_all_reduce_translator.py
index 3bef81873428a..017c8b4c90e50 100644
--- a/test/deprecated/ir/pir/translator/test_all_reduce_translator.py
+++ b/test/ir/pir/translator/test_all_reduce_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCAllReduceMinOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_barrier_translator.py b/test/ir/pir/translator/test_barrier_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_barrier_translator.py
rename to test/ir/pir/translator/test_barrier_translator.py
index 7d570df843081..60bf1ed57da17 100644
--- a/test/deprecated/ir/pir/translator/test_barrier_translator.py
+++ b/test/ir/pir/translator/test_barrier_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestBarrierOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_allreduce_min_translator.py b/test/ir/pir/translator/test_c_allreduce_min_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_allreduce_min_translator.py
rename to test/ir/pir/translator/test_c_allreduce_min_translator.py
index 60549a63ec6e4..dfb628936001e 100644
--- a/test/deprecated/ir/pir/translator/test_c_allreduce_min_translator.py
+++ b/test/ir/pir/translator/test_c_allreduce_min_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCAllReduceMinOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_allreduce_prod_translator.py b/test/ir/pir/translator/test_c_allreduce_prod_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_allreduce_prod_translator.py
rename to test/ir/pir/translator/test_c_allreduce_prod_translator.py
index 855f2e5f7293b..f803b7cabaf51 100644
--- a/test/deprecated/ir/pir/translator/test_c_allreduce_prod_translator.py
+++ b/test/ir/pir/translator/test_c_allreduce_prod_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCAllReduceProdOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_reduce_max_translator.py b/test/ir/pir/translator/test_c_reduce_max_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_reduce_max_translator.py
rename to test/ir/pir/translator/test_c_reduce_max_translator.py
index c40624ad74fbb..700135b619b6a 100644
--- a/test/deprecated/ir/pir/translator/test_c_reduce_max_translator.py
+++ b/test/ir/pir/translator/test_c_reduce_max_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCReduceMaxOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_reduce_min_translator.py b/test/ir/pir/translator/test_c_reduce_min_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_reduce_min_translator.py
rename to test/ir/pir/translator/test_c_reduce_min_translator.py
index 71610cf9a3e43..bb77cd649b16b 100644
--- a/test/deprecated/ir/pir/translator/test_c_reduce_min_translator.py
+++ b/test/ir/pir/translator/test_c_reduce_min_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCReduceMinOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_reduce_prod_translator.py b/test/ir/pir/translator/test_c_reduce_prod_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_reduce_prod_translator.py
rename to test/ir/pir/translator/test_c_reduce_prod_translator.py
index 34caa22d77b9f..ac1553bf92b7c 100644
--- a/test/deprecated/ir/pir/translator/test_c_reduce_prod_translator.py
+++ b/test/ir/pir/translator/test_c_reduce_prod_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCReduceProdOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_scatter_translator.py b/test/ir/pir/translator/test_c_scatter_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_scatter_translator.py
rename to test/ir/pir/translator/test_c_scatter_translator.py
index 66dbb3320ab43..79b8c24eb7911 100644
--- a/test/deprecated/ir/pir/translator/test_c_scatter_translator.py
+++ b/test/ir/pir/translator/test_c_scatter_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCScatterOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_c_split_translator.py b/test/ir/pir/translator/test_c_split_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_c_split_translator.py
rename to test/ir/pir/translator/test_c_split_translator.py
index e09194e9ca019..9fe3df6d3560d 100644
--- a/test/deprecated/ir/pir/translator/test_c_split_translator.py
+++ b/test/ir/pir/translator/test_c_split_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCSplitOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_dgc_momentum_translator.py b/test/ir/pir/translator/test_dgc_momentum_translator.py
similarity index 98%
rename from test/deprecated/ir/pir/translator/test_dgc_momentum_translator.py
rename to test/ir/pir/translator/test_dgc_momentum_translator.py
index b44b981ddc6cb..75a62c22e7f57 100644
--- a/test/deprecated/ir/pir/translator/test_dgc_momentum_translator.py
+++ b/test/ir/pir/translator/test_dgc_momentum_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDgcMomemtumOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_dgc_translator.py b/test/ir/pir/translator/test_dgc_translator.py
similarity index 98%
rename from test/deprecated/ir/pir/translator/test_dgc_translator.py
rename to test/ir/pir/translator/test_dgc_translator.py
index 6f2fe03137eb9..87d72c7afafcb 100644
--- a/test/deprecated/ir/pir/translator/test_dgc_translator.py
+++ b/test/ir/pir/translator/test_dgc_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDgcOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_distributed_fused_lamb.py b/test/ir/pir/translator/test_distributed_fused_lamb.py
similarity index 99%
rename from test/deprecated/ir/pir/translator/test_distributed_fused_lamb.py
rename to test/ir/pir/translator/test_distributed_fused_lamb.py
index 9493772d63799..4e03fd93082b3 100644
--- a/test/deprecated/ir/pir/translator/test_distributed_fused_lamb.py
+++ b/test/ir/pir/translator/test_distributed_fused_lamb.py
@@ -20,6 +20,8 @@
 from paddle.base import core, unique_name
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDistributedFusedLambOpTranslator(test_op_translator.TestOpTranslator):
     def setUp(self):
diff --git a/test/deprecated/ir/pir/translator/test_distributed_fused_lamb_init.py b/test/ir/pir/translator/test_distributed_fused_lamb_init.py
similarity index 99%
rename from test/deprecated/ir/pir/translator/test_distributed_fused_lamb_init.py
rename to test/ir/pir/translator/test_distributed_fused_lamb_init.py
index 618c526830d5b..8faa4a33209c9 100644
--- a/test/deprecated/ir/pir/translator/test_distributed_fused_lamb_init.py
+++ b/test/ir/pir/translator/test_distributed_fused_lamb_init.py
@@ -20,6 +20,8 @@
 from paddle.base import unique_name
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDistributedFusedLambInitOpTranslator(
     test_op_translator.TestOpTranslator
diff --git a/test/deprecated/ir/pir/translator/test_distributed_lookup_table_translate.py b/test/ir/pir/translator/test_distributed_lookup_table_translate.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_distributed_lookup_table_translate.py
rename to test/ir/pir/translator/test_distributed_lookup_table_translate.py
index ead69d9dcbbf0..e596432748779 100644
--- a/test/deprecated/ir/pir/translator/test_distributed_lookup_table_translate.py
+++ b/test/ir/pir/translator/test_distributed_lookup_table_translate.py
@@ -23,6 +23,8 @@
 )
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDistributedLookupTableOpTranslator(
     test_op_translator.TestOpTranslator
diff --git a/test/deprecated/ir/pir/translator/test_distributed_push_sparse_translator.py b/test/ir/pir/translator/test_distributed_push_sparse_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_distributed_push_sparse_translator.py
rename to test/ir/pir/translator/test_distributed_push_sparse_translator.py
index 996a48f99ec4d..d9bada6c0baeb 100644
--- a/test/deprecated/ir/pir/translator/test_distributed_push_sparse_translator.py
+++ b/test/ir/pir/translator/test_distributed_push_sparse_translator.py
@@ -23,6 +23,8 @@
 )
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDistributedPushSparseOpTranslator(
     test_op_translator.TestOpTranslator
diff --git a/test/deprecated/ir/pir/translator/test_global_gather_translator.py b/test/ir/pir/translator/test_global_gather_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_global_gather_translator.py
rename to test/ir/pir/translator/test_global_gather_translator.py
index cbd883aaf6500..83afd6b103442 100644
--- a/test/deprecated/ir/pir/translator/test_global_gather_translator.py
+++ b/test/ir/pir/translator/test_global_gather_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestGlobalGatherOpTranslator(
     test_op_translator.TestOpWithBackwardTranslator
diff --git a/test/deprecated/ir/pir/translator/test_global_scatter_translator.py b/test/ir/pir/translator/test_global_scatter_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_global_scatter_translator.py
rename to test/ir/pir/translator/test_global_scatter_translator.py
index fb349a30b95e2..3ea1c4fdc87ea 100644
--- a/test/deprecated/ir/pir/translator/test_global_scatter_translator.py
+++ b/test/ir/pir/translator/test_global_scatter_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestGlobalScatterOpTranslator(
     test_op_translator.TestOpWithBackwardTranslator
diff --git a/test/deprecated/ir/pir/translator/test_limit_by_capacity_translator.py b/test/ir/pir/translator/test_limit_by_capacity_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_limit_by_capacity_translator.py
rename to test/ir/pir/translator/test_limit_by_capacity_translator.py
index 82739201c3dd9..25a375a297709 100644
--- a/test/deprecated/ir/pir/translator/test_limit_by_capacity_translator.py
+++ b/test/ir/pir/translator/test_limit_by_capacity_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestDistributedLookupTableOpTranslator(
     test_op_translator.TestOpTranslator
diff --git a/test/deprecated/ir/pir/translator/test_nop_translator.py b/test/ir/pir/translator/test_nop_translator.py
similarity index 96%
rename from test/deprecated/ir/pir/translator/test_nop_translator.py
rename to test/ir/pir/translator/test_nop_translator.py
index e3a7722cd8354..f45ada523ec1a 100644
--- a/test/deprecated/ir/pir/translator/test_nop_translator.py
+++ b/test/ir/pir/translator/test_nop_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestNopTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_op_translator.py b/test/ir/pir/translator/test_op_translator.py
similarity index 98%
rename from test/deprecated/ir/pir/translator/test_op_translator.py
rename to test/ir/pir/translator/test_op_translator.py
index 7ec1f0dd8d380..775677ea397ff 100644
--- a/test/deprecated/ir/pir/translator/test_op_translator.py
+++ b/test/ir/pir/translator/test_op_translator.py
@@ -21,6 +21,8 @@
 
 paddle.enable_static()
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestOpTranslator(unittest.TestCase):
     def setUp(self):
diff --git a/test/deprecated/ir/pir/translator/test_partial_allgather_translator.py b/test/ir/pir/translator/test_partial_allgather_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_partial_allgather_translator.py
rename to test/ir/pir/translator/test_partial_allgather_translator.py
index 37c19e2105066..e4a1653137fb7 100644
--- a/test/deprecated/ir/pir/translator/test_partial_allgather_translator.py
+++ b/test/ir/pir/translator/test_partial_allgather_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPartialAllgetherOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_partial_recv_translator.py b/test/ir/pir/translator/test_partial_recv_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_partial_recv_translator.py
rename to test/ir/pir/translator/test_partial_recv_translator.py
index 6f06ec4fad073..953e6d9ed2f13 100644
--- a/test/deprecated/ir/pir/translator/test_partial_recv_translator.py
+++ b/test/ir/pir/translator/test_partial_recv_translator.py
@@ -23,6 +23,8 @@
 )
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPartialRecvOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_partial_send_translator.py b/test/ir/pir/translator/test_partial_send_translator.py
similarity index 96%
rename from test/deprecated/ir/pir/translator/test_partial_send_translator.py
rename to test/ir/pir/translator/test_partial_send_translator.py
index 9f133f5274969..36c275480c2bc 100644
--- a/test/deprecated/ir/pir/translator/test_partial_send_translator.py
+++ b/test/ir/pir/translator/test_partial_send_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPartialSendTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_prune_gate_by_capacity_translator.py b/test/ir/pir/translator/test_prune_gate_by_capacity_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_prune_gate_by_capacity_translator.py
rename to test/ir/pir/translator/test_prune_gate_by_capacity_translator.py
index 637429bfa70b7..0ce278f77b90e 100644
--- a/test/deprecated/ir/pir/translator/test_prune_gate_by_capacity_translator.py
+++ b/test/ir/pir/translator/test_prune_gate_by_capacity_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPruneGateByCapacityOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/ir/pir/translator/test_pull_box_sparse_translator.py b/test/ir/pir/translator/test_pull_box_sparse_translator.py
index f691892adc4f4..85fcfcb909567 100644
--- a/test/ir/pir/translator/test_pull_box_sparse_translator.py
+++ b/test/ir/pir/translator/test_pull_box_sparse_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPullBoxSparseOpTranslator(
     test_op_translator.TestOpWithBackwardTranslator
diff --git a/test/deprecated/ir/pir/translator/test_pull_gpups_sparse_translator.py b/test/ir/pir/translator/test_pull_gpups_sparse_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_pull_gpups_sparse_translator.py
rename to test/ir/pir/translator/test_pull_gpups_sparse_translator.py
index abc695a0573a2..c55a9b6eb6f3a 100644
--- a/test/deprecated/ir/pir/translator/test_pull_gpups_sparse_translator.py
+++ b/test/ir/pir/translator/test_pull_gpups_sparse_translator.py
@@ -20,6 +20,8 @@
 from paddle.base import core
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPullGpupsSparseOpTranslator(
     test_op_translator.TestOpWithBackwardTranslator
diff --git a/test/deprecated/ir/pir/translator/test_pull_sparse_v2_translator.py b/test/ir/pir/translator/test_pull_sparse_v2_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_pull_sparse_v2_translator.py
rename to test/ir/pir/translator/test_pull_sparse_v2_translator.py
index 374c7f5ee2e61..f91bb3ccc2f90 100644
--- a/test/deprecated/ir/pir/translator/test_pull_sparse_v2_translator.py
+++ b/test/ir/pir/translator/test_pull_sparse_v2_translator.py
@@ -20,6 +20,8 @@
 from paddle.base import core
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPullSparseV2OpTranslator(
     test_op_translator.TestOpWithBackwardTranslator
diff --git a/test/deprecated/ir/pir/translator/test_push_dense_translator.py b/test/ir/pir/translator/test_push_dense_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_push_dense_translator.py
rename to test/ir/pir/translator/test_push_dense_translator.py
index cdd87ba72d3ed..26191a0b2d048 100644
--- a/test/deprecated/ir/pir/translator/test_push_dense_translator.py
+++ b/test/ir/pir/translator/test_push_dense_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestPushDenseOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_random_routing_translator.py b/test/ir/pir/translator/test_random_routing_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_random_routing_translator.py
rename to test/ir/pir/translator/test_random_routing_translator.py
index 86d047930f8b7..c8b353fd7e71f 100644
--- a/test/deprecated/ir/pir/translator/test_random_routing_translator.py
+++ b/test/ir/pir/translator/test_random_routing_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestRandomRoutingOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/deprecated/ir/pir/translator/test_send_and_recv_translator.py b/test/ir/pir/translator/test_send_and_recv_translator.py
similarity index 97%
rename from test/deprecated/ir/pir/translator/test_send_and_recv_translator.py
rename to test/ir/pir/translator/test_send_and_recv_translator.py
index c452ae34eb7c7..e71d43c524ba9 100644
--- a/test/deprecated/ir/pir/translator/test_send_and_recv_translator.py
+++ b/test/ir/pir/translator/test_send_and_recv_translator.py
@@ -19,6 +19,8 @@
 import paddle
 from paddle.base.layer_helper import LayerHelper
 
+paddle.pir_utils._switch_to_old_ir_()
+
 
 class TestCReduceMinOpTranslator(test_op_translator.TestOpTranslator):
     def append_op(self):
diff --git a/test/ir/test_ir_fusion_group_pass.py b/test/ir/test_ir_fusion_group_pass.py
index 0637efb067f7e..56c723613e939 100644
--- a/test/ir/test_ir_fusion_group_pass.py
+++ b/test/ir/test_ir_fusion_group_pass.py
@@ -72,7 +72,7 @@ def _feed_random_data(self, feed_vars):
             elif var.dtype == paddle.float16:
                 dtype = "float16"
             else:
-                raise ValueError("Unsupported dtype %s" % var.dtype)
+                raise ValueError(f"Unsupported dtype {var.dtype}")
             feeds[var.name] = np.random.random(shape).astype(dtype)
         return feeds
 
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 8c4cfe9113ab3..4b390ca18a3f1 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -155,6 +155,7 @@ if(WIN32)
   list(REMOVE_ITEM TEST_OPS test_fused_layernorm_op)
   list(REMOVE_ITEM TEST_OPS test_matmul_int8_op)
   list(REMOVE_ITEM TEST_OPS test_variable_length_memory_efficient_attention)
+  list(REMOVE_ITEM TEST_OPS test_ops_nms)
 endif()
 list(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
 
@@ -421,14 +422,11 @@ function(parallel_bash_test_modules TARGET_NAME)
   endif()
 endfunction()
 
+list(REMOVE_ITEM TEST_OPS test_data_norm_op)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu)
-list(REMOVE_ITEM TEST_OPS
-     test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
 list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
@@ -437,6 +435,8 @@ list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
 list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass)
 list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
 list(REMOVE_ITEM TEST_OPS test_layers)
+list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
+list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
@@ -485,6 +485,8 @@ endif()
 # Some ops need to check results when gc is enabled
 # Currently, only ops that register NoNeedBufferVarsInference need to do this test
 set(TEST_OPS_WITH_GC
+    test_affine_channel_op
+    test_scatter_op
     test_concat_op
     test_elementwise_add_op
     test_lookup_table_op
@@ -571,6 +573,11 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND))
                   test_fused_dot_product_attention_op)
 endif()
 
+py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS
+                ${GC_ENVS})
+py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS
+                ${GC_ENVS})
+
 set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
 set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
@@ -681,7 +688,7 @@ if(WITH_DISTRIBUTE)
     endif()
   endif()
 endif()
-
+py_test_modules(test_data_norm_op MODULES test_data_norm_op)
 py_test_modules(
   test_fuse_bn_add_act_pass
   MODULES
@@ -753,6 +760,7 @@ if(WITH_DISTRIBUTE)
 endif()
 
 # setting timeout value as 15S
+set_tests_properties(test_isin PROPERTIES TIMEOUT 30)
 set_tests_properties(test_binomial_op PROPERTIES TIMEOUT 30)
 set_tests_properties(test_run PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 180)
@@ -788,12 +796,18 @@ if(WITH_NV_JETSON)
   set_tests_properties(test_norm_op PROPERTIES TIMEOUT 1200)
   set_tests_properties(test_batch_norm_op_prim_nchw PROPERTIES TIMEOUT 1500)
   set_tests_properties(test_batch_norm_op_prim_nhwc PROPERTIES TIMEOUT 1500)
+  set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500)
+  set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200)
+  set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500)
 else()
   set_tests_properties(test_concat_op PROPERTIES TIMEOUT 400)
   set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120)
   set_tests_properties(test_norm_op PROPERTIES TIMEOUT 150)
   set_tests_properties(test_batch_norm_op_prim_nchw PROPERTIES TIMEOUT 250)
   set_tests_properties(test_batch_norm_op_prim_nhwc PROPERTIES TIMEOUT 250)
+  set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150)
+  set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 250)
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   py_test_modules(test_conv3d_transpose_op MODULES test_conv3d_transpose_op
@@ -946,6 +960,10 @@ if(WITH_CUDNN_FRONTEND)
 endif()
 
 set(TEST_CINN_OPS
+    test_assign_op
+    test_layer_norm_op
+    test_atan2_op
+    test_cast_op
     test_stack_op
     test_activation_op
     test_fill_any_like_op
@@ -954,6 +972,22 @@ set(TEST_CINN_OPS
     test_elementwise_sub_op
     test_elementwise_div_op
     test_elementwise_max_op
+    test_elementwise_mul_op
+    test_elementwise_pow_op
+    test_expand_v2_op
+    test_flatten_contiguous_range_op
+    test_flip
+    test_full_like_op
+    test_top_k_op
+    test_top_k_v2_op
+    test_reshape_op
+    test_triangular_solve_op
+    test_split_op
+    test_scatter_op
+    test_reverse_op
+    test_roll_op
+    test_meshgrid_op
+    test_index_select_op
     test_mean_op
     test_clip_op
     test_gather_op
@@ -997,6 +1031,13 @@ set_tests_properties(
 # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default.
 set(STATIC_BUILD_TESTS
     test_adagrad_op
+    test_batch_norm_op
+    test_nce
+    test_layer_norm_op
+    test_eigh_op
+    test_matmul_op
+    test_matmul_v2_op
+    test_paddle_save_load_binary
     test_assign_pos_op
     test_bucketize_api
     test_c_embedding_op
@@ -1099,3 +1140,54 @@ set_pir_tests_properties()
 set_tests_properties(test_nadam_op PROPERTIES TIMEOUT 100)
 set_tests_properties(test_radam_op PROPERTIES TIMEOUT 100)
 set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120)
+set_tests_properties(test_sparse_mask_as_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_conv2d_op_depthwise_conv
+                     PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120)
+set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300)
+set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300)
+set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_fractional_max_pool2d_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60)
+set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60)
+set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200)
+set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_imperative_star_gan_with_gradient_penalty
+                     PROPERTIES TIMEOUT 120)
+set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120)
+set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60)
+set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120)
+if(WIN32)
+  set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900)
+else()
+  set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600)
+endif()
+set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80)
+set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120)
+set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120)
+set_tests_properties(test_paddle_save_load_binary_static_build
+                     PROPERTIES TIMEOUT 120)
+set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120)
+set_tests_properties(test_conv_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 220)
+set_tests_properties(test_data_norm_op PROPERTIES LABELS "RUN_TYPE=DIST")
+set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200)
+set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150)
+set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150)
+set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250)
diff --git a/test/legacy_test/dist_ctr_reader.py b/test/legacy_test/dist_ctr_reader.py
index 23f4daf2a5d8f..039d2c8aaf178 100644
--- a/test/legacy_test/dist_ctr_reader.py
+++ b/test/legacy_test/dist_ctr_reader.py
@@ -114,7 +114,7 @@ def train(self):
         Load trainset.
         '''
         file_name = "train.txt"
-        logger.info("load trainset from %s" % file_name)
+        logger.info(f"load trainset from {file_name}")
         mode = TaskMode.create_train()
         return self._parse_creator(file_name, mode)
 
@@ -123,7 +123,7 @@ def test(self):
         Load testset.
         '''
         file_name = "test.txt"
-        logger.info("load testset from %s" % file_name)
+        logger.info(f"load testset from {file_name}")
         mode = TaskMode.create_test()
         return self._parse_creator(file_name, mode)
 
@@ -132,7 +132,7 @@ def infer(self):
         Load infer set.
         '''
         file_name = "infer.txt"
-        logger.info("load inferset from %s" % file_name)
+        logger.info(f"load inferset from {file_name}")
         mode = TaskMode.create_infer()
         return self._parse_creator(file_name, mode)
 
diff --git a/test/legacy_test/dist_test.sh b/test/legacy_test/dist_test.sh
index 69a893a7ddc13..3ae7b209f4a00 100644
--- a/test/legacy_test/dist_test.sh
+++ b/test/legacy_test/dist_test.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -82,7 +82,7 @@ if [[ $exit_code -ne 0 ]]; then
 fi
 
 #display system context
-for i in {1..2}; do 
+for i in {1..2}; do
     sleep 3
     ps -aux
     netstat -anlp
diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py
index 210db283b979a..41c668043e3f8 100644
--- a/test/legacy_test/gradient_checker.py
+++ b/test/legacy_test/gradient_checker.py
@@ -324,7 +324,7 @@ def _compute_analytical_jacobian_pir(
     filted_idx, filted_dx = zip(*filted)
 
     # get the name in feeds of dyi
-    name = 'dys_%s' % i
+    name = f'dys_{i}'
     np_t = np.array(feeds[name]).astype(np_type)
     shape = np_t.shape
     np_t = np_t.flatten()
@@ -392,7 +392,7 @@ def fail_test(msg):
     if in_pir_mode():
         analytical = []
         for i in range(len(y)):
-            name = 'dys_%s' % i
+            name = f'dys_{i}'
             feeds.update(
                 {
                     name: np.zeros(
@@ -780,7 +780,7 @@ def get_pir_static_double_grad(
             yi.persistable = True
             np_type = dtype_to_np_dtype(yi.dtype)
             dy = paddle.static.data(
-                name='Dgrad_%s' % i,
+                name=f'Dgrad_{i}',
                 shape=yi.shape,
                 dtype=np_type,
             )
@@ -797,7 +797,7 @@ def get_pir_static_double_grad(
             yi.persistable = True
             np_type = dtype_to_np_dtype(yi.dtype)
             dy = paddle.static.data(
-                name='Dgrad_%s' % i,
+                name=f'Dgrad_{i}',
                 shape=yi.shape,
                 dtype=np_type,
             )
@@ -851,12 +851,12 @@ def get_pir_static_double_grad(
         yi = y[i]
         np_type = dtype_to_np_dtype(yi.dtype)
         dy = paddle.static.data(
-            name='dys_%s' % i,
+            name=f'dys_{i}',
             shape=yi.shape,
             dtype=np_type,
         )
         value = np.ones(yi.shape, dtype=np_type)
-        feeds.update({'dys_%s' % i: value})
+        feeds.update({f'dys_{i}': value})
         dys.append(dy)
 
     # append second order backward
@@ -1130,7 +1130,7 @@ def get_pir_static_triple_grad(
             yi.persistable = True
             np_type = dtype_to_np_dtype(yi.dtype)
             dy = paddle.static.data(
-                name='Tgrad_%s' % i,
+                name=f'Tgrad_{i}',
                 shape=yi.shape,
                 dtype=np_type,
             )
@@ -1147,7 +1147,7 @@ def get_pir_static_triple_grad(
             yi.persistable = True
             np_type = dtype_to_np_dtype(yi.dtype)
             dy = paddle.static.data(
-                name='Tgrad_%s' % i,
+                name=f'Tgrad_{i}',
                 shape=yi.shape,
                 dtype=np_type,
             )
diff --git a/test/legacy_test/op.py b/test/legacy_test/op.py
index 0dec2f001188e..e60a0e63ae8dd 100644
--- a/test/legacy_test/op.py
+++ b/test/legacy_test/op.py
@@ -163,7 +163,7 @@ def __call__(self, *args, **kwargs):
                         new_attr.scalars.MergeFrom(item)
                 else:
                     raise NotImplementedError(
-                        "A not supported attribute type: %s." % (str(attr.type))
+                        f"A not supported attribute type: {str(attr.type)}."
                     )
         for attr_name, defalut_val in self.__extra_attrs__.items():
             user_defined_attr = kwargs.get(attr_name, None)
@@ -212,7 +212,7 @@ def __call__(self, *args, **kwargs):
                         new_attr.scalars.MergeFrom(item)
                 else:
                     raise NotImplementedError(
-                        "A not supported attribute type: %s." % (str(attr_type))
+                        f"A not supported attribute type: {str(attr_type)}."
                     )
 
         return op_desc
@@ -292,7 +292,7 @@ def types(self):
 
     def get_op_info(self, t):
         if t not in self.op_methods:
-            raise ValueError("The operator: %s is not registered." % t)
+            raise ValueError(f"The operator: {t} is not registered.")
         return self.op_methods.get(t)
 
     def get_op_input_names(self, type):
diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py
index ed4e0f478ed38..eec710f01cf8e 100644
--- a/test/legacy_test/op_test.py
+++ b/test/legacy_test/op_test.py
@@ -114,7 +114,7 @@ def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs):
                         )
                     input_t.append(
                         paddle.static.data(
-                            name='data_%s' % index, shape=shape, dtype=dtype
+                            name=f'data_{index}', shape=shape, dtype=dtype
                         )
                     )
 
@@ -223,7 +223,7 @@ def __get_elem__(tensor, i):
             return tensor._get_complex128_element(i)
         else:
             raise TypeError(
-                "Unsupported test data type %s." % tensor_to_check_dtype
+                f"Unsupported test data type {tensor_to_check_dtype}."
             )
 
     def __set_elem__(tensor, i, e):
@@ -251,7 +251,7 @@ def __set_elem__(tensor, i, e):
             return tensor._set_complex128_element(i, e)
         else:
             raise TypeError(
-                "Unsupported test data type %s." % tensor_to_check_dtype
+                f"Unsupported test data type {tensor_to_check_dtype}."
             )
 
     # we only compute gradient of one element each time.
@@ -501,7 +501,7 @@ def is_complex_test():
                 and not hasattr(cls, "exist_check_grad")
             ):
                 raise AssertionError(
-                    "This test of %s op needs check_grad." % cls.op_type
+                    f"This test of {cls.op_type} op needs check_grad."
                 )
 
             # check for op test with fp64 precision, but not check onednn op test for now
@@ -518,8 +518,7 @@ def is_complex_test():
                 and not cls.check_prim_pir
             ):
                 raise AssertionError(
-                    "This test of %s op needs check_grad with fp64 precision."
-                    % cls.op_type
+                    f"This test of {cls.op_type} op needs check_grad with fp64 precision."
                 )
 
             if (
@@ -1061,7 +1060,7 @@ def create_var(
                     name_temp = name
                 else:
                     nplist_value_temp = np_list[name]
-                    name_temp = unique_name.generate("%s_out" % (name))
+                    name_temp = unique_name.generate(f"{name}_out")
                 v = create_var(
                     nplist_value_temp,
                     name_temp,
@@ -1184,10 +1183,9 @@ def cal_python_api(python_api, args, kernel_sig):
                 return None
             if not hasattr(self, "python_api"):
                 print(kernel_sig)
-            assert hasattr(self, "python_api"), (
-                "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True"
-                % self.op_type
-            )
+            assert hasattr(
+                self, "python_api"
+            ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True"
             args = OpTestUtils.prepare_python_api_arguments(
                 self.python_api,
                 dygraph_tensor_inputs,
@@ -1288,10 +1286,9 @@ def get_kernel_signature(self, place, egr_inps=None, egr_oups=None):
                 return None
             if not hasattr(self, "python_api"):
                 print(kernel_sig)
-            assert hasattr(self, "python_api"), (
-                "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True"
-                % self.op_type
-            )
+            assert hasattr(
+                self, "python_api"
+            ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True"
             return kernel_sig
 
     def get_ir_input_attr_dict_and_feed(self, stop_gradient):
@@ -2573,7 +2570,7 @@ def _is_skip_name(self, name):
                 not in no_check_set_white_list.no_check_set_white_list
             ):
                 raise AssertionError(
-                    "no_check_set of op %s must be set to None." % self.op_type
+                    f"no_check_set of op {self.op_type} must be set to None."
                 )
 
         if check_prim:
@@ -3091,7 +3088,7 @@ def check_grad_with_place_for_static(
             analytic_grads,
             inputs_to_check,
             max_relative_error,
-            "Gradient Check On %s" % str(place),
+            f"Gradient Check On {str(place)}",
             atol=atol,
         )
 
@@ -3366,7 +3363,7 @@ def check_grad_with_place(
                     dygraph_dygraph_grad,
                     inputs_to_check,
                     max_relative_error,
-                    "Gradient Check On %s" % str(place),
+                    f"Gradient Check On {str(place)}",
                     atol=atol,
                 )
 
@@ -3406,7 +3403,7 @@ def check_grad_with_place(
                     pir_grad,
                     inputs_to_check,
                     max_relative_error,
-                    "Gradient Check On %s" % str(place),
+                    f"Gradient Check On {str(place)}",
                     atol=atol,
                 )
 
@@ -3484,7 +3481,7 @@ def _get_dygraph_grad(
                         )
                     else:
                         raise TypeError(
-                            "Unsupported test data type %s." % type(cast_input)
+                            f"Unsupported test data type {type(cast_input)}."
                         )
 
                 outputs = {}
@@ -3850,12 +3847,12 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
                         range(len(user_defined_grad_outputs)),
                     ):
                         grad_val = paddle.static.data(
-                            name='val_grad_%s' % idx,
+                            name=f'val_grad_{idx}',
                             shape=grad_out_value.shape,
                             dtype=grad_out_value.dtype,
                         )
                         grad_outputs.append(grad_val)
-                        feed.update({'val_grad_%s' % idx: grad_out_value})
+                        feed.update({f'val_grad_{idx}': grad_out_value})
                     # delete the inputs which no need to calculate grad
                     for no_grad_val in no_grad_set:
                         del static_inputs[no_grad_val]
@@ -3894,8 +3891,7 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
                             )
                         else:
                             raise TypeError(
-                                "Unsupported test data type %s."
-                                % type(cast_input)
+                                f"Unsupported test data type {type(cast_input)}."
                             )
 
                     outputs = {}
diff --git a/test/legacy_test/parallel_test.sh b/test/legacy_test/parallel_test.sh
index 551b7cdb7a43c..893163700a55d 100644
--- a/test/legacy_test/parallel_test.sh
+++ b/test/legacy_test/parallel_test.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py
index 6894d37a2839a..c059499f43e16 100644
--- a/test/legacy_test/prim_op_test.py
+++ b/test/legacy_test/prim_op_test.py
@@ -100,8 +100,7 @@ def _get_kernel_signature(
             """we think the kernel_sig is missing."""
             kernel_sig = None
             print(
-                "[Warning: op_test.py] Kernel Signature is not found for %s, fall back to intermediate state."
-                % op_type
+                f"[Warning: op_test.py] Kernel Signature is not found for {op_type}, fall back to intermediate state."
             )
         return kernel_sig
 
@@ -677,9 +676,9 @@ def check_static_comp(self):
                 # ensure the operator not in program if check_prim is True
                 if not in_pir_mode():
                     forward_ops = [op.type for op in main_program.blocks[0].ops]
-                    assert self.op_type not in forward_ops, (
-                        "%s shouldn't appear in program when check_prim is True"
-                    ) % (self.op_type)
+                    assert (
+                        self.op_type not in forward_ops
+                    ), f"{self.op_type} shouldn't appear in program when check_prim is True"
                 exe = paddle.static.Executor(self.place)
                 exe.run(startup_program)
                 ret = exe.run(main_program, feed=feed, fetch_list=ret)
@@ -761,9 +760,9 @@ def check_jit_comp(self):
                     .forward_program.block(0)
                     .ops
                 ]
-                assert self.op_type not in forward_ops, (
-                    "%s shouldn't appear in program when check_prim is True"
-                ) % (self.op_type)
+                assert (
+                    self.op_type not in forward_ops
+                ), f"{self.op_type} shouldn't appear in program when check_prim is True"
             ret = flatten(_as_list(net(args)))
             ret = paddle.utils.map_structure(lambda x: x.numpy(), ret)
             if OpTestUtils.is_bfloat16_type(self.dtype):
@@ -854,9 +853,9 @@ def check_jit_comp_with_cinn(self):
                 .forward_program.block(0)
                 .ops
             ]
-            assert self.op_type not in forward_ops, (
-                "%s shouldn't appear in program when check_prim is True"
-            ) % (self.op_type)
+            assert (
+                self.op_type not in forward_ops
+            ), f"{self.op_type} shouldn't appear in program when check_prim is True"
             ret = flatten(_as_list(net(args)))
             ret = paddle.utils.map_structure(lambda x: x.numpy(), ret)
             if OpTestUtils.is_bfloat16_type(self.dtype):
@@ -1160,9 +1159,9 @@ def check_static_comp(self):
                 if not in_pir_mode():
                     ops = [op.type for op in main_program.blocks[0].ops]
                     backward_op_type = self.op_type + "_grad"
-                    assert backward_op_type not in ops, (
-                        "%s shouldn't appear in program when check_prim is True"
-                    ) % (backward_op_type)
+                    assert (
+                        backward_op_type not in ops
+                    ), f"{backward_op_type} shouldn't appear in program when check_prim is True"
                 elif self.prim_op_type == "prim":
                     grad_ops = []
                     for op in main_program.global_block().ops:
@@ -1261,9 +1260,9 @@ def check_jit_comp(self):
                     .ops
                 ]
                 backward_op_type = self.op_type + "_grad"
-                assert backward_op_type not in ops, (
-                    "%s shouldn't appear in program when check_prim is True"
-                ) % (backward_op_type)
+                assert (
+                    backward_op_type not in ops
+                ), f"{backward_op_type} shouldn't appear in program when check_prim is True"
             out = _as_list(net(args))
             if hasattr(self.op_test, "python_out_sig"):
                 outputs_sig = self.op_test.python_out_sig
@@ -1387,9 +1386,9 @@ def check_jit_comp_with_cinn(self):
                 .ops
             ]
             backward_op_type = self.op_type + "_grad"
-            assert backward_op_type not in ops, (
-                "%s shouldn't appear in program when check_prim is True"
-            ) % (backward_op_type)
+            assert (
+                backward_op_type not in ops
+            ), f"{backward_op_type} shouldn't appear in program when check_prim is True"
 
             out = _as_list(net(args))
             if hasattr(self.op_test, "python_out_sig"):
diff --git a/test/legacy_test/run_server_for_communicator_geo.py b/test/legacy_test/run_server_for_communicator_geo.py
index 4f4173e5a2d0f..31bdddda31a15 100644
--- a/test/legacy_test/run_server_for_communicator_geo.py
+++ b/test/legacy_test/run_server_for_communicator_geo.py
@@ -16,7 +16,7 @@
 import sys
 
 sys.path.append("../deprecated/legacy_test")
-from test_communicator_geo import TestCommunicatorGeoEnd2End
+from test_communicator_geo_deprecated import TestCommunicatorGeoEnd2End
 
 import paddle
 
diff --git a/test/legacy_test/test_ZeroPad1d.py b/test/legacy_test/test_ZeroPad1d.py
new file mode 100644
index 0000000000000..31baf6a7cf246
--- /dev/null
+++ b/test/legacy_test/test_ZeroPad1d.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import to_tensor
+from paddle.nn import ZeroPad1D
+
+
+class TestZeroPad1dAPI(unittest.TestCase):
+    """Compare paddle.nn.ZeroPad1D results with numpy.pad references."""
+
+    def setUp(self):
+        # Use GPU 0 when Paddle was built with CUDA, otherwise use the CPU.
+        if paddle.is_compiled_with_cuda():
+            paddle.device.set_device('gpu:0')
+        else:
+            paddle.device.set_device('cpu')
+        self.shape = [4, 6, 6]
+        self.support_dtypes = ['float32', 'float64', 'int32', 'int64']
+
+    def test_support_dtypes(self):
+        """An int padding pads both ends of the last axis, for each dtype."""
+        for dtype in self.support_dtypes:
+            pad = 2
+            x = np.random.randint(-255, 255, size=self.shape).astype(dtype)
+            expect_res = np.pad(
+                x,
+                [[0, 0], [0, 0], [pad, pad]],
+                mode='constant',
+                constant_values=0,
+            )
+
+            x_tensor = to_tensor(x).astype(dtype)
+            zeropad1d = ZeroPad1D(padding=pad)
+            ret_res = zeropad1d(x_tensor).numpy()
+            np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_support_pad2(self):
+        """A two-element list gives the pad widths of the last axis."""
+        pad = [1, 2]
+        x = np.random.randint(-255, 255, size=self.shape)
+        expect_res = np.pad(
+            x, [[0, 0], [0, 0], pad], mode='constant', constant_values=0
+        )
+
+        x_tensor = to_tensor(x)
+        zeropad1d = ZeroPad1D(padding=pad)
+        ret_res = zeropad1d(x_tensor).numpy()
+        np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_support_pad3(self):
+        """A tuple padding is accepted like the equivalent list."""
+        pad = (1, 2)
+        x = np.random.randint(-255, 255, size=self.shape)
+        expect_res = np.pad(x, [[0, 0], [0, 0], [pad[0], pad[1]]])
+
+        x_tensor = to_tensor(x)
+        zeropad1d = ZeroPad1D(padding=pad)
+        ret_res = zeropad1d(x_tensor).numpy()
+        np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_support_pad4(self):
+        """An int32 tensor padding is accepted like the equivalent list."""
+        pad = [1, 2]
+        x = np.random.randint(-255, 255, size=self.shape)
+        expect_res = np.pad(x, [[0, 0], [0, 0], [pad[0], pad[1]]])
+
+        x_tensor = to_tensor(x)
+        pad_tensor = to_tensor(pad, dtype='int32')
+        zeropad1d = ZeroPad1D(padding=pad_tensor)
+        ret_res = zeropad1d(x_tensor).numpy()
+        np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_repr(self):
+        """extra_repr() shows the padding and data_format=NCL here."""
+        pad = [1, 2]
+        zeropad1d = ZeroPad1D(padding=pad)
+        name_str = zeropad1d.extra_repr()
+        # assertEqual still runs under `python -O`, unlike a bare assert.
+        self.assertEqual(name_str, 'padding=[1, 2], data_format=NCL')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/legacy_test/test_ZeroPad3d.py b/test/legacy_test/test_ZeroPad3d.py
new file mode 100644
index 0000000000000..8cc7a45c959df
--- /dev/null
+++ b/test/legacy_test/test_ZeroPad3d.py
@@ -0,0 +1,128 @@
+#   Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import to_tensor
+from paddle.nn import ZeroPad3D
+
+
+class TestZeroPad3DAPI(unittest.TestCase):
+    """Compare paddle.nn.ZeroPad3D results with numpy.pad references."""
+
+    def setUp(self):
+        # Use GPU 0 when Paddle was built with CUDA, otherwise use the CPU.
+        if paddle.is_compiled_with_cuda():
+            paddle.device.set_device('gpu:0')
+        else:
+            paddle.device.set_device('cpu')
+        self.shape = [4, 3, 6, 6, 6]
+        self.support_dtypes = ['float32', 'float64', 'int32', 'int64']
+
+    def test_support_dtypes(self):
+        """An int padding pads both ends of the last three axes."""
+        for dtype in self.support_dtypes:
+            pad = 2
+            x = np.random.randint(-255, 255, size=self.shape).astype(dtype)
+            expect_res = np.pad(
+                x,
+                [[0, 0], [0, 0], [pad, pad], [pad, pad], [pad, pad]],
+                mode='constant',
+                constant_values=0,
+            )
+
+            x_tensor = to_tensor(x).astype(dtype)
+            zeropad3d = ZeroPad3D(padding=pad)
+            ret_res = zeropad3d(x_tensor).numpy()
+            np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_support_pad2(self):
+        """A six-element list pads the last axis first (pad[0], pad[1])."""
+        pad = [1, 2, 3, 4, 5, 6]
+        x = np.random.randint(-255, 255, size=self.shape)
+        expect_res = np.pad(
+            x,
+            [
+                [0, 0],
+                [0, 0],
+                [pad[4], pad[5]],
+                [pad[2], pad[3]],
+                [pad[0], pad[1]],
+            ],
+            mode='constant',
+            constant_values=0,
+        )
+
+        x_tensor = to_tensor(x)
+        zeropad3d = ZeroPad3D(padding=pad)
+        ret_res = zeropad3d(x_tensor).numpy()
+        np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_support_pad3(self):
+        """A tuple padding is accepted like the equivalent list."""
+        pad = (1, 2, 3, 4, 5, 6)
+        x = np.random.randint(-255, 255, size=self.shape)
+        expect_res = np.pad(
+            x,
+            [
+                [0, 0],
+                [0, 0],
+                [pad[4], pad[5]],
+                [pad[2], pad[3]],
+                [pad[0], pad[1]],
+            ],
+        )
+
+        x_tensor = to_tensor(x)
+        zeropad3d = ZeroPad3D(padding=pad)
+        ret_res = zeropad3d(x_tensor).numpy()
+        np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_support_pad4(self):
+        """An int32 tensor padding is accepted like the equivalent list."""
+        pad = [1, 2, 3, 4, 5, 6]
+        x = np.random.randint(-255, 255, size=self.shape)
+        expect_res = np.pad(
+            x,
+            [
+                [0, 0],
+                [0, 0],
+                [pad[4], pad[5]],
+                [pad[2], pad[3]],
+                [pad[0], pad[1]],
+            ],
+        )
+
+        x_tensor = to_tensor(x)
+        pad_tensor = to_tensor(pad, dtype='int32')
+        zeropad3d = ZeroPad3D(padding=pad_tensor)
+        ret_res = zeropad3d(x_tensor).numpy()
+        np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05)
+
+    def test_repr(self):
+        """extra_repr() shows the padding and data_format=NCDHW here."""
+        pad = [1, 2, 3, 4, 5, 6]
+        zeropad3d = ZeroPad3D(padding=pad)
+        name_str = zeropad3d.extra_repr()
+        # assertEqual still runs under `python -O`, unlike a bare assert.
+        self.assertEqual(
+            name_str, 'padding=[1, 2, 3, 4, 5, 6], data_format=NCDHW'
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_accuracy_op.py b/test/legacy_test/test_accuracy_op.py
similarity index 99%
rename from test/deprecated/legacy_test/test_accuracy_op.py
rename to test/legacy_test/test_accuracy_op.py
index 44c4cfa7c49ac..bf6d86d10da9e 100755
--- a/test/deprecated/legacy_test/test_accuracy_op.py
+++ b/test/legacy_test/test_accuracy_op.py
@@ -126,7 +126,7 @@ def test_type_errors(self):
                 self.assertRaises(TypeError, paddle.metric.accuracy, x2, label)
 
                 x3 = paddle.static.data(
-                    name='input', shape=[-1, 2], dtype="float16"
+                    name='input', shape=[-1, 2], dtype="float32"
                 )
                 paddle.static.accuracy(input=x3, label=label)
                 paddle.metric.accuracy(input=x3, label=label)
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py
index 7806017bbfeed..4de793c943265 100644
--- a/test/legacy_test/test_activation_op.py
+++ b/test/legacy_test/test_activation_op.py
@@ -365,19 +365,19 @@ def test_out_name(self):
                 data = paddle.static.data(
                     name="X", shape=[-1, 1], dtype="float32"
                 )
-                out = eval("paddle.%s(data, name='Y')" % self.op_type)
+                out = eval(f"paddle.{self.op_type}(data, name='Y')")
                 place = base.CPUPlace()
                 exe = base.Executor(place)
                 (result,) = exe.run(feed={"X": np_x}, fetch_list=[out])
-                expected = eval("np.%s(np_x)" % self.op_type)
+                expected = eval(f"np.{self.op_type}(np_x)")
                 np.testing.assert_allclose(result, expected, rtol=1e-05)
 
     def test_dygraph(self):
         with base.dygraph.guard():
             np_x = np.array([0.1])
             x = paddle.to_tensor(np_x)
-            z = eval("paddle.%s(x).numpy()" % self.op_type)
-            z_expected = eval("np.%s(np_x)" % self.op_type)
+            z = eval(f"paddle.{self.op_type}(x).numpy()")
+            z_expected = eval(f"np.{self.op_type}(np_x)")
             np.testing.assert_allclose(z, z_expected, rtol=1e-05)
 
 
@@ -3287,26 +3287,34 @@ def test_check_grad(self):
         self.check_grad(
             ['X'],
             'Out',
-            check_prim=True
-            if self.dtype not in [np.complex64, np.complex128]
-            else False,
+            check_prim=(
+                True
+                if self.dtype not in [np.complex64, np.complex128]
+                else False
+            ),
             only_check_prim=self.if_only_check_prim(),
             check_pir=True,
-            check_prim_pir=True
-            if self.dtype not in [np.complex64, np.complex128]
-            else False,
+            check_prim_pir=(
+                True
+                if self.dtype not in [np.complex64, np.complex128]
+                else False
+            ),
             check_pir_onednn=self.check_pir_onednn,
         )
 
     def test_check_output(self):
         self.check_output(
-            check_prim=True
-            if self.dtype not in [np.complex64, np.complex128]
-            else False,
+            check_prim=(
+                True
+                if self.dtype not in [np.complex64, np.complex128]
+                else False
+            ),
             check_pir=True,
-            check_prim_pir=True
-            if self.dtype not in [np.complex64, np.complex128]
-            else False,
+            check_prim_pir=(
+                True
+                if self.dtype not in [np.complex64, np.complex128]
+                else False
+            ),
             check_pir_onednn=self.check_pir_onednn,
         )
 
@@ -4880,8 +4888,8 @@ def test_errors(self):
                     F.softsign(x_fp16)
 
 
-def ref_thresholded_relu(x, threshold=1.0):
-    out = (x > threshold) * x
+def ref_thresholded_relu(x, threshold=1.0, value=0.0):
+    out = (x > threshold) * x + (x <= threshold) * value
     return out
 
 
@@ -4893,15 +4901,16 @@ def setUp(self):
         self.python_api = paddle.nn.functional.thresholded_relu
 
         threshold = 15
+        value = 5
 
         np.random.seed(1024)
         x = np.random.uniform(-20, 20, self.shape).astype(self.dtype)
         x[np.abs(x) < 0.005] = 0.02
-        out = ref_thresholded_relu(x, threshold)
+        out = ref_thresholded_relu(x, threshold, value)
 
         self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)}
         self.outputs = {'Out': out}
-        self.attrs = {"threshold": threshold}
+        self.attrs = {"threshold": threshold, "value": value}
         self.convert_input_output()
 
     def init_shape(self):
@@ -4929,6 +4938,7 @@ class TestThresholdedReluAPI(unittest.TestCase):
     # test paddle.nn.ThresholdedReLU, paddle.nn.functional.thresholded_relu
     def setUp(self):
         self.threshold = 15
+        self.value = 5
         np.random.seed(1024)
         self.x_np = np.random.uniform(-20, 20, [10, 12]).astype(np.float64)
         self.x_np[np.abs(self.x_np) < 0.005] = 0.02
@@ -4943,22 +4953,30 @@ def test_static_api(self):
         with static_guard():
             with paddle.static.program_guard(paddle.static.Program()):
                 x = paddle.static.data('X', self.x_np.shape, self.x_np.dtype)
-                out1 = F.thresholded_relu(x, self.threshold)
-                thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold)
+                out1 = F.thresholded_relu(x, self.threshold, self.value)
+                thresholded_relu = paddle.nn.ThresholdedReLU(
+                    self.threshold, self.value
+                )
                 out2 = thresholded_relu(x)
                 exe = paddle.static.Executor(self.place)
                 res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
-            out_ref = ref_thresholded_relu(self.x_np, self.threshold)
+            out_ref = ref_thresholded_relu(
+                self.x_np, self.threshold, self.value
+            )
             for r in res:
                 np.testing.assert_allclose(out_ref, r, rtol=1e-05)
 
     def test_dygraph_api(self):
         with dynamic_guard():
             x = paddle.to_tensor(self.x_np)
-            out1 = F.thresholded_relu(x, self.threshold)
-            thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold)
+            out1 = F.thresholded_relu(x, self.threshold, self.value)
+            thresholded_relu = paddle.nn.ThresholdedReLU(
+                self.threshold, self.value
+            )
             out2 = thresholded_relu(x)
-            out_ref = ref_thresholded_relu(self.x_np, self.threshold)
+            out_ref = ref_thresholded_relu(
+                self.x_np, self.threshold, self.value
+            )
             for r in [out1, out2]:
                 np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
 
@@ -5359,7 +5377,7 @@ def create_test_act_fp16_class(
     enable_cinn=False,
     check_pir=False,
     grad_atol=1e-2,
-    **kwargs
+    **kwargs,
 ):
     @unittest.skipIf(
         not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA"
@@ -5556,7 +5574,7 @@ def create_test_act_bf16_class(
     check_pir=False,
     check_prim_pir=False,
     grad_atol=1e-2,
-    **kwargs
+    **kwargs,
 ):
     @unittest.skipIf(
         not core.is_compiled_with_cuda()
diff --git a/test/deprecated/legacy_test/test_adamax_api.py b/test/legacy_test/test_adamax_api.py
similarity index 66%
rename from test/deprecated/legacy_test/test_adamax_api.py
rename to test/legacy_test/test_adamax_api.py
index 1fc1878d81995..a995659df4c10 100644
--- a/test/deprecated/legacy_test/test_adamax_api.py
+++ b/test/legacy_test/test_adamax_api.py
@@ -17,7 +17,6 @@
 import numpy as np
 
 import paddle
-from paddle import base
 
 
 class TestAdamaxAPI(unittest.TestCase):
@@ -36,34 +35,6 @@ def test_adamax_api_dygraph(self):
         adam.step()
         adam.clear_gradients()
 
-    def test_adamax_api(self):
-        paddle.enable_static()
-        place = base.CPUPlace()
-        shape = [2, 3, 8, 8]
-        exe = base.Executor(place)
-        train_prog = base.Program()
-        startup = base.Program()
-        with base.program_guard(train_prog, startup):
-            with base.unique_name.guard():
-                data = paddle.static.data(name="data", shape=shape)
-                conv = paddle.static.nn.conv2d(data, 8, 3)
-                loss = paddle.mean(conv)
-                beta1 = 0.85
-                beta2 = 0.95
-                opt = paddle.optimizer.Adamax(
-                    learning_rate=1e-5,
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01,
-                    epsilon=1e-8,
-                )
-                opt.minimize(loss)
-
-        exe.run(startup)
-        data_np = np.random.random(shape).astype('float32')
-        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
-        assert rets[0] is not None
-
 
 class TestAdamaxAPIGroup(TestAdamaxAPI):
     def test_adamax_api_dygraph(self):
diff --git a/test/deprecated/legacy_test/test_add_position_encoding_op.py b/test/legacy_test/test_add_position_encoding_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_add_position_encoding_op.py
rename to test/legacy_test/test_add_position_encoding_op.py
diff --git a/test/deprecated/legacy_test/test_addmm_op.py b/test/legacy_test/test_addmm_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_addmm_op.py
rename to test/legacy_test/test_addmm_op.py
diff --git a/test/deprecated/legacy_test/test_affine_channel_op.py b/test/legacy_test/test_affine_channel_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_affine_channel_op.py
rename to test/legacy_test/test_affine_channel_op.py
diff --git a/test/deprecated/legacy_test/test_affine_grid_op.py b/test/legacy_test/test_affine_grid_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_affine_grid_op.py
rename to test/legacy_test/test_affine_grid_op.py
diff --git a/test/deprecated/legacy_test/test_argsort_op.py b/test/legacy_test/test_argsort_op.py
similarity index 68%
rename from test/deprecated/legacy_test/test_argsort_op.py
rename to test/legacy_test/test_argsort_op.py
index 58597766644f5..e1786ada841bc 100644
--- a/test/deprecated/legacy_test/test_argsort_op.py
+++ b/test/legacy_test/test_argsort_op.py
@@ -20,9 +20,6 @@
 import paddle
 from paddle import base
 from paddle.base import core
-from paddle.base.backward import append_backward
-from paddle.base.executor import Executor
-from paddle.base.framework import Program, grad_var_name
 from paddle.pir_utils import test_with_pir_api
 
 np.random.seed(123)
@@ -66,293 +63,6 @@ def create_tensor(np_data, place):
     return tensor
 
 
-class TestArgsortOpCPU(unittest.TestCase):
-    def setup_program(self):
-        self.main_program = Program()
-        self.startup_program = Program()
-        self.init_place()
-
-    def setUp(self):
-        paddle.enable_static()
-        self.init_axis()
-        self.init_datatype()
-        self.init_direction()
-        self.init_inputshape()
-
-        self.setup_program()
-        self.feed_data_field = {"x", "label"}
-        self.grad_data_field = {"x"}
-
-        self.py_argsort = PyArgsort(
-            self.input_shape, self.axis, self.descending, self.dtype
-        )
-
-        with base.program_guard(self.main_program, self.startup_program):
-            x = paddle.static.data(
-                name="x", shape=[-1] + list(self.input_shape), dtype=self.dtype
-            )
-            x.stop_gradient = False
-            x.desc.set_need_check_feed(False)
-            label = paddle.static.data(
-                name="label",
-                shape=[-1] + list(self.input_shape),
-                dtype=self.dtype,
-            )
-            label.desc.set_need_check_feed(False)
-            self.index = paddle.argsort(
-                x=x, axis=self.axis, descending=self.descending
-            )
-            self.sorted_x = paddle.sort(
-                x=x, axis=self.axis, descending=self.descending
-            )
-            self.sorted_x.stop_gradient = False
-            loss = paddle.multiply(self.sorted_x, label)
-            self.loss = paddle.sum(loss)
-
-    def forward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_argsort, x), self.place)
-            for x in self.feed_data_field
-        }
-        exe = Executor(self.place)
-        out = exe.run(
-            self.main_program,
-            feed=self.feed_map,
-            fetch_list=[self.index, self.sorted_x, self.loss],
-        )
-        return out
-
-    def backward(self):
-        self.feed_map = {
-            x: create_tensor(getattr(self.py_argsort, x), self.place)
-            for x in self.feed_data_field
-        }
-        fetch_list = [
-            self.main_program.global_block().var(grad_var_name(x))
-            for x in self.grad_data_field
-        ]
-        exe = Executor(self.place)
-        out = exe.run(
-            self.main_program,
-            feed=self.feed_map,
-            fetch_list=fetch_list,
-            return_numpy=False,
-        )
-        return out
-
-    def test_backward(self, numeric_grad_delta=1e-5, max_relative_error=1e-7):
-        self.check_forward()
-
-        with base.program_guard(self.main_program, self.startup_program):
-            append_backward(self.loss)
-
-        ana_grad = [np.array(x) for x in self.backward()]
-
-        num_grad = self.get_numerical_gradient(delta=numeric_grad_delta)
-        self.assert_is_close(
-            num_grad,
-            ana_grad,
-            'x',
-            max_relative_error=max_relative_error,
-            msg_prefix="Gradient Check On %s" % str(self.place),
-        )
-
-    def check_forward(self):
-        pd_outputs = self.forward()
-        py_outputs = self.py_argsort.forward()
-        for pd_output, py_output in zip(pd_outputs, py_outputs):
-            self.assertEqual(pd_output.shape, py_output.shape)
-            np.testing.assert_allclose(
-                pd_output, py_output, rtol=1e-05, atol=0, equal_nan=False
-            )
-
-    def get_numerical_gradient(self, delta=1e-7):
-        if self.dtype == 'float16':
-            delta = np.array(delta).astype(np.float16)
-        feed_list = [getattr(self.py_argsort, x) for x in self.grad_data_field]
-        grad_list = [np.zeros_like(x) for x in feed_list]
-        for feed, grad in zip(feed_list, grad_list):
-            for f, g in np.nditer([feed, grad], op_flags=['readwrite']):
-                o = float(f)
-                f[...] = o + delta
-                y_pos = self.forward()[2]
-
-                f[...] = o - delta
-                y_neg = self.forward()[2]
-
-                f[...] = o
-                dout_dfeed = (y_pos - y_neg) / (delta * 2)
-                g[...] = dout_dfeed
-
-        return grad_list
-
-    def assert_is_close(
-        self,
-        numeric_grads,
-        analytic_grads,
-        names,
-        max_relative_error,
-        msg_prefix,
-    ):
-        for a, b, name in zip(numeric_grads, analytic_grads, names):
-            abs_a = np.abs(a)
-            abs_a[abs_a < 1e-3] = 1
-
-            diff_mat = np.abs(a - b) / abs_a
-            max_diff = np.max(diff_mat)
-
-            def err_msg():
-                offset = np.argmax(diff_mat > max_relative_error)
-                return (
-                    "%s error, %s variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, expected %f, but got %f."
-                ) % (
-                    'argsort',
-                    msg_prefix,
-                    name,
-                    max_diff,
-                    max_relative_error,
-                    offset,
-                    a.flatten()[offset],
-                    b.flatten()[offset],
-                )
-
-            self.assertLessEqual(max_diff, max_relative_error, err_msg())
-
-    def init_axis(self):
-        self.axis = -1
-
-    def init_datatype(self):
-        self.dtype = "float64"
-
-    def init_direction(self):
-        self.descending = False
-
-    def init_inputshape(self):
-        self.input_shape = (2, 2, 2, 2, 3)
-
-    def init_place(self):
-        self.place = core.CPUPlace()
-
-
-class TestArgsortOpGPU(TestArgsortOpCPU):
-    def init_place(self):
-        if core.is_compiled_with_cuda():
-            self.place = core.CUDAPlace(0)
-        else:
-            self.place = core.CPUPlace()
-
-
-class TestArgsortOpAxis0CPU(TestArgsortOpCPU):
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestArgsortOpAxis0GPU(TestArgsortOpGPU):
-    def init_axis(self):
-        self.axis = 0
-
-
-class TestArgsortOpAxis1CPU(TestArgsortOpCPU):
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestArgsortOpAxis1GPU(TestArgsortOpGPU):
-    def init_axis(self):
-        self.axis = 1
-
-
-class TestArgsortOpAxis2CPU(TestArgsortOpCPU):
-    def init_axis(self):
-        self.axis = 2
-
-
-class TestArgsortOpAxis2GPU(TestArgsortOpGPU):
-    def init_axis(self):
-        self.axis = 2
-
-
-class TestArgsortOpAxisNeg1CPU(TestArgsortOpCPU):
-    def init_axis(self):
-        self.axis = -1
-
-
-class TestArgsortOpAxisNeg1GPU(TestArgsortOpGPU):
-    def init_axis(self):
-        self.axis = -1
-
-
-class TestArgsortOpAxisNeg2CPU(TestArgsortOpCPU):
-    def init_axis(self):
-        self.axis = -2
-
-
-class TestArgsortOpAxisNeg2GPU(TestArgsortOpGPU):
-    def init_axis(self):
-        self.axis = -2
-
-
-class TestArgsortOpDescendingAxisCPU(TestArgsortOpCPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxisGPU(TestArgsortOpGPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxis0CPU(TestArgsortOpAxis0CPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxis0GPU(TestArgsortOpAxis0GPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxis1CPU(TestArgsortOpAxis1CPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxis1GPU(TestArgsortOpAxis1GPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxis2CPU(TestArgsortOpAxis2CPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxis2GPU(TestArgsortOpAxis2GPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxisNeg1CPU(TestArgsortOpAxisNeg1CPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxisNeg1GPU(TestArgsortOpAxisNeg1GPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxisNeg2CPU(TestArgsortOpAxisNeg2CPU):
-    def init_direction(self):
-        self.descending = True
-
-
-class TestArgsortOpDescendingAxisNeg2GPU(TestArgsortOpAxisNeg2GPU):
-    def init_direction(self):
-        self.descending = True
-
-
 class TestArgsortErrorOnCPU(unittest.TestCase):
     def setUp(self):
         self.place = core.CPUPlace()
diff --git a/test/deprecated/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_assign_op.py
rename to test/legacy_test/test_assign_op.py
diff --git a/test/deprecated/legacy_test/test_atan2_op.py b/test/legacy_test/test_atan2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_atan2_op.py
rename to test/legacy_test/test_atan2_op.py
diff --git a/test/deprecated/legacy_test/test_attribute_var.py b/test/legacy_test/test_attribute_var.py
similarity index 82%
rename from test/deprecated/legacy_test/test_attribute_var.py
rename to test/legacy_test/test_attribute_var.py
index e06e8a3d80d50..cdae49ba0741a 100644
--- a/test/deprecated/legacy_test/test_attribute_var.py
+++ b/test/legacy_test/test_attribute_var.py
@@ -66,43 +66,6 @@ def infer_prog(self):
         return res
 
 
-class TestDropout(UnittestBase):
-    def init_info(self):
-        self.shapes = [[10, 10]]
-        self.save_path = os.path.join(self.temp_dir.name, 'dropout')
-
-    def test_static(self):
-        main_prog = Program()
-        startup_prog = Program()
-        with program_guard(main_prog, startup_prog):
-            fc = paddle.nn.Linear(10, 10)
-            x = paddle.randn(self.shapes[0])
-            x.stop_gradient = False
-            feat = fc(x)
-            # p is a Variable
-            p = paddle.randn([1])
-            out = paddle.nn.functional.dropout(feat, p=p)
-            sgd = paddle.optimizer.SGD()
-            sgd.minimize(paddle.mean(out))
-            # test _to_string
-            self.assertTrue("Var[" in str(main_prog))
-
-            exe = paddle.static.Executor()
-            exe.run(startup_prog)
-            res = exe.run(fetch_list=[x, out])
-            # export model
-            paddle.static.save_inference_model(self.save_path, [x], [out], exe)
-
-            # Test for Inference Predictor
-            infer_out = self.infer_prog()
-            self.assertEqual(infer_out.shape, (10, 10))
-
-            self.assertEqual(
-                main_prog.block(0).ops[4].all_attrs()['dropout_prob'].name,
-                p.name,
-            )
-
-
 class TestTileTensorList(UnittestBase):
     def init_info(self):
         self.shapes = [[2, 3, 4]]
diff --git a/test/legacy_test/test_backward.py b/test/legacy_test/test_backward.py
new file mode 100644
index 0000000000000..05fdb572c79de
--- /dev/null
+++ b/test/legacy_test/test_backward.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.base import backward
+
+
+class BackwardNet:
+    """
+    Abstract Base Class.
+    All nets inheriting from this class should implement two functions:
+        build_model: build net to test the logic of backward
+        init_data: fake input data to test all programs.
+    """
+
+    def __init__(self):
+        self.stop_gradient_grad_vars = set()
+        self.no_grad_vars = set()
+        self.params_names = set()
+        self.op_path = []
+
+    def build_model(self):
+        """
+        Build net to test the logic of backward.
+        :return: loss
+        """
+        raise NotImplementedError
+
+    def init_data(self):
+        """
+        Fake input data to test all programs.
+        :return: dict, {'var_name': var_data}
+        """
+        raise NotImplementedError
+
+
+# TODO(Aurelius84): add conditional network test
+class ConditionalNet(BackwardNet):
+    def __init__(self):
+        super().__init__()
+
+
+class TestBackwardUninitializedVariable(unittest.TestCase):
+    """This case was found in yolov5 during to_static conversion.
+    Gradient aggregation may end up summing an invalid variable.
+    """
+
+    def test(self):
+        paddle.enable_static()
+        main_prg, startup_prg = paddle.static.Program(), paddle.static.Program()
+        with paddle.static.program_guard(main_prg, startup_prg):
+            gt = paddle.static.data(name='gt', shape=[4], dtype='float32')
+            x = paddle.static.data(name='x', shape=[2], dtype='float32')
+            gt.stop_gradient = True
+            x.stop_gradient = False
+            gt = gt.reshape([4, 1]).reshape([4])
+            loss = (
+                paddle.nn.functional.binary_cross_entropy(x, gt[:2])
+                + (gt[2:4] * x).sum()
+            )
+            exe = paddle.static.Executor()
+            paddle.base.backward.gradients(loss, [])
+            exe.run(startup_prg)
+            # Run the main program and fetch the loss
+            out = exe.run(
+                main_prg,
+                feed={
+                    'gt': np.array([1.0, 1.0, 0.0, 0.0], dtype='float32'),
+                    'x': np.array([0.5, 0.5], dtype='float32'),
+                },
+                fetch_list=[loss],
+            )
+            print(out)
+
+
+class TestStripGradSuffix(unittest.TestCase):
+    def test_strip_grad_suffix(self):
+        cases = (
+            ('x@GRAD', 'x'),
+            ('x@GRAD@GRAD', 'x'),
+            ('x@GRAD@RENAME@1', 'x'),
+            ('x@GRAD_slice_0@GRAD', 'x@GRAD_slice_0'),
+            ('grad/grad/x@GRAD@RENAME@block0@1@GRAD', 'x'),
+        )
+        for input_, desired in cases:
+            self.assertEqual(backward._strip_grad_suffix_(input_), desired)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py
similarity index 63%
rename from test/deprecated/legacy_test/test_batch_norm_op.py
rename to test/legacy_test/test_batch_norm_op.py
index 63893a4353a3c..445c2082d13cd 100644
--- a/test/deprecated/legacy_test/test_batch_norm_op.py
+++ b/test/legacy_test/test_batch_norm_op.py
@@ -26,7 +26,7 @@
 
 import paddle
 from paddle import base
-from paddle.base import Program, core, program_guard
+from paddle.base import core
 from paddle.base.framework import grad_var_name
 from paddle.pir_utils import test_with_pir_api
 
@@ -545,402 +545,6 @@ def test_check_output(self):
                 )
 
 
-class TestBatchNormOpTraining(unittest.TestCase):
-    def setUp(self):
-        self.use_mkldnn = False
-        self.fuse_with_relu = False
-        self.data_formats = ["NCHW", "NHWC"]
-        self.momentum = 0.9
-        self.use_momentum_variable = False
-        self.epsilon = 0.00001
-        self.init_kernel_type()
-        self.init_test_case()
-
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y',
-            'mean',
-            'variance',
-            'saved_mean',
-            'saved_variance',
-            'x@GRAD',
-            'scale@GRAD',
-            'bias@GRAD',
-        ]
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        np.allclose(np.array(tensor), np_array, atol=atol)
-
-    def ref_forward_backward(
-        self,
-        x,
-        y_grad,
-        scale,
-        bias,
-        mean,
-        variance,
-        epsilon,
-        momentum,
-        shape,
-        data_layout,
-    ):
-        # run forward
-        y, saved_mean, var_ref = _reference_training(
-            x, scale, bias, epsilon, data_layout
-        )
-        mean_out = saved_mean * (1.0 - momentum) + momentum * mean
-        variance_out = var_ref * (1.0 - momentum) + momentum * variance
-        saved_variance = 1.0 / np.sqrt(var_ref + epsilon)
-        # run backward
-        x_grad, scale_grad, bias_grad = _reference_grad(
-            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout
-        )
-
-        return (
-            y,
-            mean_out,
-            variance_out,
-            saved_mean,
-            saved_variance,
-            x_grad,
-            scale_grad,
-            bias_grad,
-        )
-
-    def set_mean_variance(self, scale_shape, x, data_layout):
-        mean, variance = _cal_mean_variance(x, self.epsilon, data_layout)
-        mean_pre = np.zeros(scale_shape).astype(np.float32)
-        variance_pre = np.ones(scale_shape).astype(np.float32)
-        # computing global mean/variance for one step
-        if self.use_global_stats:
-            mom = self.momentum
-            mean = mean * (1.0 - mom) + mom * mean_pre
-            variance = variance * (1.0 - mom) + mom * variance_pre
-        return mean, variance
-
-    def test_forward_backward(self):
-        def test_with_place(place, data_layout, shape):
-            # attr
-            epsilon = self.epsilon
-            momentum = self.momentum
-            if data_layout == "NCHW":
-                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
-            else:
-                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-            scale_shape = [c]
-
-            np.random.seed(123)
-            x = np.random.random_sample(shape).astype(np.float32)
-            scale = np.random.random_sample(scale_shape).astype(np.float32)
-            bias = np.random.random_sample(scale_shape).astype(np.float32)
-            mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
-            y_grad = np.random.random_sample(shape).astype(np.float32)
-            momentum_var = np.array([momentum]).astype(np.float32)
-
-            (
-                y,
-                mean_out,
-                variance_out,
-                saved_mean,
-                saved_variance,
-                x_grad,
-                scale_grad,
-                bias_grad,
-            ) = self.ref_forward_backward(
-                x,
-                y_grad,
-                scale,
-                bias,
-                mean,
-                variance,
-                epsilon,
-                momentum,
-                shape,
-                data_layout,
-            )
-
-            var_dict = locals()
-            var_dict['y@GRAD'] = y_grad
-            var_dict['x@GRAD'] = x_grad
-            var_dict['scale@GRAD'] = scale_grad
-            var_dict['bias@GRAD'] = bias_grad
-
-            var_names = [
-                'x',
-                'scale',
-                'bias',
-                'mean',
-                'variance',
-                'y',
-                'saved_mean',
-                'saved_variance',
-                'momentum_var',
-            ]
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = base.Program()
-            with base.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape,
-                    )
-                inputs = {
-                    "X": block.var('x'),
-                    "Scale": block.var('scale'),
-                    "Bias": block.var('bias'),
-                    "Mean": block.var('mean'),
-                    "Variance": block.var('variance'),
-                }
-                attrs = {
-                    "epsilon": epsilon,
-                    "is_test": False,
-                    "data_layout": data_layout,
-                    "use_mkldnn": self.use_mkldnn,
-                    "fuse_with_relu": self.fuse_with_relu,
-                    "use_global_stats": self.use_global_stats,
-                }
-                if self.use_momentum_variable:
-                    inputs['MomentumTensor'] = block.var('momentum_var')
-                else:
-                    attrs['momentum'] = momentum
-
-                outputs = {
-                    "Y": block.var('y'),
-                    "MeanOut": block.var('mean'),  # share memory
-                    "VarianceOut": block.var('variance'),  # share memory
-                    "SavedMean": block.var('saved_mean'),
-                    "SavedVariance": block.var('saved_variance'),
-                }
-                block.create_var(name="reserve_space", dtype='float32')
-                outputs["ReserveSpace"] = block.var('reserve_space')
-                bn_op = block.append_op(
-                    type="batch_norm",
-                    inputs=inputs,
-                    outputs=outputs,
-                    attrs=attrs,
-                )
-                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
-
-                # generate backward op_desc
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    bn_op.desc, self.no_grad_set, []
-                )
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                program._sync_with_cpp()
-
-                exe = base.Executor(place)
-                out = exe.run(
-                    program,
-                    feed={
-                        name: var_dict[name]
-                        for name in [
-                            'x',
-                            'scale',
-                            'bias',
-                            'mean',
-                            'variance',
-                            'y@GRAD',
-                            'momentum_var',
-                        ]
-                    },
-                    fetch_list=self.fetch_list,
-                )
-
-            for id, name in enumerate(self.fetch_list):
-                if name == 'variance':
-                    self.__assert_close(
-                        var_dict[name], out[id], name, atol=1e-3
-                    )
-                    continue
-                self.__assert_close(var_dict[name], out[id], name)
-            print("op test forward passed: ", str(place), data_layout)
-
-        places = [core.CPUPlace()]
-
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            for data_format in self.data_formats:
-                test_with_place(place, data_format, [2, 3, 4, 5])
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = {'scale@GRAD', 'bias@GRAD'}
-        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
-
-
-class TestBatchNormOpTrainingCase2(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y',
-            'mean',
-            'variance',
-            'saved_mean',
-            'saved_variance',
-            'x@GRAD',
-            'scale@GRAD',
-            'bias@GRAD',
-        ]
-        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = "1"
-
-
-class TestBatchNormOpTrainingCase3(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = False
-        self.no_grad_set = {'x@GRAD'}
-        self.fetch_list = ['y', 'mean', 'variance', 'scale@GRAD', 'bias@GRAD']
-
-
-class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_momentum_variable = True
-        self.use_global_stats = False
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y',
-            'mean',
-            'variance',
-            'saved_mean',
-            'saved_variance',
-            'x@GRAD',
-            'scale@GRAD',
-            'bias@GRAD',
-        ]
-
-
-class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
-    def init_test_case(self):
-        self.use_global_stats = True
-        self.no_grad_set = set()
-        self.fetch_list = [
-            'y',
-            'mean',
-            'variance',
-            'x@GRAD',
-            'scale@GRAD',
-            'bias@GRAD',
-        ]
-
-    def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format):
-        if data_format == "NCHW":
-            x = np.transpose(x, (0, 2, 3, 1))
-            y_grad = np.transpose(y_grad, (0, 2, 3, 1))
-
-        x_grad = scale * y_grad / np.sqrt(var + epsilon)
-        grad_scale = np.sum(
-            y_grad * (x - mean) / np.sqrt(var + epsilon), axis=(0, 1, 2)
-        )
-        grad_offset = np.sum(y_grad, axis=(0, 1, 2))
-
-        # transfer back to N, C, H, W
-        if data_format == "NCHW":
-            x_grad = np.transpose(x_grad, (0, 3, 1, 2))
-            x = np.transpose(x, (0, 3, 1, 2))
-            y_grad = np.transpose(y_grad, (0, 3, 1, 2))
-
-        return x_grad, grad_scale, grad_offset
-
-    def ref_forward_backward(
-        self,
-        x,
-        y_grad,
-        scale,
-        bias,
-        mean,
-        variance,
-        epsilon,
-        momentum,
-        shape,
-        data_layout,
-    ):
-        if data_layout != "NCHW" and data_layout != "NHWC":
-            raise ValueError("Unknown data order.")
-
-        if data_layout == "NCHW":
-            x = np.transpose(x, (0, 2, 3, 1))
-
-        # run normalizaton
-        normalized = (x - mean) / np.sqrt(variance + epsilon)
-        y = normalized * scale + bias
-
-        # transfer back to N, C, H, W
-        if data_layout == "NCHW":
-            x = np.transpose(x, (0, 3, 1, 2))
-            y = np.transpose(y, (0, 3, 1, 2))
-
-        mean_out = mean
-        variance_out = variance
-        saved_variance = 1.0 / np.sqrt(variance + epsilon)
-        # run backward
-        x_grad, scale_grad, bias_grad = self.reference_grad(
-            x, y_grad, scale, mean, variance, epsilon, data_layout
-        )
-
-        return (
-            y,
-            mean_out,
-            variance_out,
-            mean,
-            saved_variance,
-            x_grad,
-            scale_grad,
-            bias_grad,
-        )
-
-
-class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
-    TestBatchNormOpFreezeStatsTraining
-):
-    def init_test_case(self):
-        self.use_global_stats = True
-        self.no_grad_set = {'scale@GRAD', 'bias@GRAD'}
-        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
-
-
-class TestBatchNormOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            # the input of batch_norm must be Variable.
-            x1 = base.create_lod_tensor(
-                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace()
-            )
-            self.assertRaises(TypeError, paddle.static.nn.batch_norm, x1)
-
-            # the input dtype of batch_norm must be float16 or float32 or float64
-            # float16 only can be set on GPU place
-            x2 = paddle.static.data(
-                name='x2', shape=[-1, 3, 4, 5, 6], dtype="int32"
-            )
-            self.assertRaises(TypeError, paddle.static.nn.batch_norm, x2)
-
-            # the first dimension of input for batch_norm must between [2d, 5d].
-            x3 = paddle.static.data("", shape=[0], dtype="float32")
-            self.assertRaises(ValueError, paddle.static.nn.batch_norm, x3)
-
-
 class TestDygraphBatchNormAPIError(unittest.TestCase):
     @test_with_pir_api
     def test_errors(self):
diff --git a/test/deprecated/legacy_test/test_bce_loss.py b/test/legacy_test/test_bce_loss.py
similarity index 100%
rename from test/deprecated/legacy_test/test_bce_loss.py
rename to test/legacy_test/test_bce_loss.py
diff --git a/test/deprecated/legacy_test/test_bicubic_interp_op.py b/test/legacy_test/test_bicubic_interp_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_bicubic_interp_op.py
rename to test/legacy_test/test_bicubic_interp_op.py
diff --git a/test/deprecated/legacy_test/test_bilinear_interp_op.py b/test/legacy_test/test_bilinear_interp_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_bilinear_interp_op.py
rename to test/legacy_test/test_bilinear_interp_op.py
diff --git a/test/legacy_test/test_bilinear_tensor_product_op.py b/test/legacy_test/test_bilinear_tensor_product_op.py
new file mode 100644
index 0000000000000..8a74e5c2bdfbf
--- /dev/null
+++ b/test/legacy_test/test_bilinear_tensor_product_op.py
@@ -0,0 +1,57 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import OpTest
+
+import paddle
+from paddle import base
+
+
+class TestBilinearTensorProductOp(OpTest):
+    """Checks bilinear_tensor_product against a NumPy reference."""
+
+    def setUp(self):
+        self.op_type = "bilinear_tensor_product"
+        self.python_api = paddle.nn.functional.bilinear
+        n_rows, x_dim, y_dim, out_dim = 6, 5, 4, 5
+        # Shapes: X (6, 5), Y (6, 4), Weight (5, 5, 4), Bias (1, 5).
+        # ROCm builds use float32; every other build uses float64.
+        on_rocm = base.core.is_compiled_with_rocm()
+        dtype = "float32" if on_rocm else "float64"
+        # Random draws keep the X, Y, Weight, Bias order.
+        x = np.random.random((n_rows, x_dim)).astype(dtype)
+        y = np.random.random((n_rows, y_dim)).astype(dtype)
+        weight = np.random.random((out_dim, x_dim, y_dim)).astype(dtype)
+        bias = np.random.random((1, out_dim)).astype(dtype)
+        expected = np.zeros((n_rows, out_dim)).astype(dtype)
+        # Column k of the reference: sum over axis 1 of (x @ weight[k]) * y.
+        for k, w_k in enumerate(weight):
+            expected[:, k] = np.sum(np.matmul(x, w_k) * y, axis=1)
+        self.inputs = {'X': x, 'Y': y, 'Weight': weight, 'Bias': bias}
+        self.outputs = {'Out': expected + bias}
+
+    def test_check_output(self):
+        # Inherited forward-output check against self.outputs.
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        # Inherited gradient check of Out with respect to all four inputs.
+        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/legacy_test/test_block_diag.py b/test/legacy_test/test_block_diag.py
new file mode 100644
index 0000000000000..842f360f33c4b
--- /dev/null
+++ b/test/legacy_test/test_block_diag.py
@@ -0,0 +1,95 @@
+#   Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import scipy
+
+import paddle
+from paddle import base
+
+
+class TestBlockDiagError(unittest.TestCase):
+    """Error-path checks for paddle.block_diag."""
+
+    def test_errors(self):
+        # NumPy inputs must raise TypeError; 3-D tensors must raise ValueError.
+        # The same three 2x2 matrices feed both failure cases.
+        mats = ([[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]])
+
+        # Plain NumPy arrays are passed instead of tensors.
+        with self.assertRaises(TypeError):
+            arrays = [np.array(m) for m in mats]
+            with paddle.static.program_guard(base.Program()):
+                paddle.block_diag(arrays)
+
+        # One extra nesting level makes every tensor three-dimensional.
+        with self.assertRaises(ValueError):
+            cubes = [paddle.to_tensor([m]) for m in mats]
+            with paddle.static.program_guard(base.Program()):
+                paddle.block_diag(cubes)
+
+
+class TestBlockDiag(unittest.TestCase):
+    """Compares paddle.block_diag with scipy.linalg.block_diag."""
+
+    def setUp(self):
+        paddle.seed(2024)
+        self.type_list = ['int32', 'int64', 'float32', 'float64']
+        # Each entry pairs a device name with its place object.
+        self.place = [('cpu', paddle.CPUPlace())]
+        if paddle.is_compiled_with_cuda():
+            self.place.append(('gpu', paddle.CUDAPlace(0)))
+
+    def test_dygraph(self):
+        # Dynamic-graph output must match SciPy for each device and dtype.
+        paddle.disable_static()
+        for device, _ in self.place:
+            paddle.set_device(device)
+            for dtype in self.type_list:
+                # Input blocks have shapes (2, 3), (2,) and (4, 1).
+                A = np.random.randn(2, 3).astype(dtype)
+                B = np.random.randn(2).astype(dtype)
+                C = np.random.randn(4, 1).astype(dtype)
+                expected = scipy.linalg.block_diag(A, B, C)
+                tensors = [paddle.to_tensor(blk) for blk in (A, B, C)]
+                actual = paddle.block_diag(tensors)
+                np.testing.assert_allclose(actual.numpy(), expected)
+
+    def test_static(self):
+        # Static-graph output must match SciPy for each device and dtype.
+        paddle.enable_static()
+        for device, place in self.place:
+            paddle.set_device(device)
+            for dtype in self.type_list:
+                A = np.random.randn(2, 3).astype(dtype)
+                B = np.random.randn(2).astype(dtype)
+                C = np.random.randn(4, 1).astype(dtype)
+                expected = scipy.linalg.block_diag(A, B, C)
+                feed = {'A': A, 'B': B, 'C': C}
+                with paddle.static.program_guard(paddle.static.Program()):
+                    holders = [
+                        paddle.static.data('A', [2, 3], dtype),
+                        paddle.static.data('B', [2], dtype),
+                        paddle.static.data('C', [4, 1], dtype),
+                    ]
+                    out = paddle.block_diag(holders)
+                    exe = paddle.static.Executor(place)
+                    res = exe.run(feed=feed, fetch_list=[out])
+                    np.testing.assert_allclose(res[0], expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_bmm_op.py b/test/legacy_test/test_bmm_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_bmm_op.py
rename to test/legacy_test/test_bmm_op.py
diff --git a/test/deprecated/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_cast_op.py
rename to test/legacy_test/test_cast_op.py
diff --git a/test/deprecated/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py
similarity index 100%
rename from test/deprecated/legacy_test/test_channel_shuffle.py
rename to test/legacy_test/test_channel_shuffle.py
diff --git a/test/legacy_test/test_cholesky_op.py b/test/legacy_test/test_cholesky_op.py
index d98596fc29c89..25fc0f9365299 100644
--- a/test/legacy_test/test_cholesky_op.py
+++ b/test/legacy_test/test_cholesky_op.py
@@ -121,14 +121,14 @@ def func(self, place):
                 for i in range(len(out)):
                     yi = out[i]
                     dy = paddle.static.data(
-                        name='dys_%s' % i,
+                        name=f'dys_{i}',
                         shape=yi.shape,
                         dtype=root_data.dtype,
                     )
                     dy.stop_gradient = False
                     dy.persistable = True
                     value = np.zeros(yi.shape, dtype=root_data.dtype)
-                    feeds.update({'dys_%s' % i: value})
+                    feeds.update({f'dys_{i}': value})
                     dys.append(dy)
                 fetch_list = base.gradients(out, root, dys)
             grad_check(
diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py
index fa31fe1e16b54..dfc5c36a7eb5a 100644
--- a/test/legacy_test/test_collective_api_base.py
+++ b/test/legacy_test/test_collective_api_base.py
@@ -201,7 +201,7 @@ def setUp(self):
         self._trainers = 2
         self._ps_endpoints = f"127.0.0.1:{self._find_free_port()},127.0.0.1:{self._find_free_port()}"
         self._python_interp = sys.executable
-        self._master_endpoints = "127.0.0.1:%s" % (self._find_free_port())
+        self._master_endpoints = f"127.0.0.1:{self._find_free_port()}"
 
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -305,15 +305,15 @@ def _run_cluster(self, model_file, envs):
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
-        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n')
+        sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n')
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
         with open(path0, "r") as f:
-            sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
+            sys.stderr.write(f'trainer 0 stderr file: {f.read()}\n')
         with open(path1, "r") as f:
-            sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
+            sys.stderr.write(f'trainer 1 stderr file: {f.read()}\n')
 
         def load_and_remove(path):
             with open(path, 'rb') as f:
diff --git a/test/legacy_test/test_collective_base.py b/test/legacy_test/test_collective_base.py
index b11b992bcd5f8..07573f6ce7e00 100644
--- a/test/legacy_test/test_collective_base.py
+++ b/test/legacy_test/test_collective_base.py
@@ -232,8 +232,8 @@ def _run_cluster(self, model_file, envs):
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
-        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n')
+        sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n')
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
diff --git a/test/deprecated/legacy_test/test_complex_abs.py b/test/legacy_test/test_complex_abs.py
similarity index 100%
rename from test/deprecated/legacy_test/test_complex_abs.py
rename to test/legacy_test/test_complex_abs.py
diff --git a/test/deprecated/legacy_test/test_complex_op.py b/test/legacy_test/test_complex_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_complex_op.py
rename to test/legacy_test/test_complex_op.py
diff --git a/test/deprecated/legacy_test/test_complex_variable.py b/test/legacy_test/test_complex_variable.py
similarity index 100%
rename from test/deprecated/legacy_test/test_complex_variable.py
rename to test/legacy_test/test_complex_variable.py
diff --git a/test/deprecated/legacy_test/test_complex_view_op.py b/test/legacy_test/test_complex_view_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_complex_view_op.py
rename to test/legacy_test/test_complex_view_op.py
diff --git a/test/deprecated/legacy_test/test_conj_op.py b/test/legacy_test/test_conj_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_conj_op.py
rename to test/legacy_test/test_conj_op.py
diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py
index a3bfa75d1225f..b0b0d0abe2d96 100644
--- a/test/legacy_test/test_conv2d_op.py
+++ b/test/legacy_test/test_conv2d_op.py
@@ -34,14 +34,14 @@ def conv2d_forward_naive(
 ):
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
-            "Unknown Attr(data_format): '%s' ."
-            "It can only be 'NCHW' or 'NHWC'." % str(data_format)
+            f"Unknown Attr(data_format): '{str(data_format)}' ."
+            "It can only be 'NCHW' or 'NHWC'."
         )
 
     channel_last = data_format == "NHWC"
diff --git a/test/deprecated/legacy_test/test_conv2d_op_depthwise_conv.py b/test/legacy_test/test_conv2d_op_depthwise_conv.py
similarity index 100%
rename from test/deprecated/legacy_test/test_conv2d_op_depthwise_conv.py
rename to test/legacy_test/test_conv2d_op_depthwise_conv.py
diff --git a/test/legacy_test/test_conv2d_transpose_op.py b/test/legacy_test/test_conv2d_transpose_op.py
index 36796adfdaec2..dd14afecf09ec 100644
--- a/test/legacy_test/test_conv2d_transpose_op.py
+++ b/test/legacy_test/test_conv2d_transpose_op.py
@@ -37,8 +37,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     padding_algorithm = attrs['padding_algorithm']
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if attrs['data_format'] == 'NHWC':
diff --git a/test/deprecated/legacy_test/test_conv2d_transpose_op_depthwise_conv.py b/test/legacy_test/test_conv2d_transpose_op_depthwise_conv.py
similarity index 100%
rename from test/deprecated/legacy_test/test_conv2d_transpose_op_depthwise_conv.py
rename to test/legacy_test/test_conv2d_transpose_op_depthwise_conv.py
diff --git a/test/legacy_test/test_conv3d_op.py b/test/legacy_test/test_conv3d_op.py
index 143deb493c756..a41580c7b0445 100644
--- a/test/legacy_test/test_conv3d_op.py
+++ b/test/legacy_test/test_conv3d_op.py
@@ -37,14 +37,14 @@ def conv3d_forward_naive(
 ):
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
-            "Unknown Attr(data_format): '%s' ."
-            "It can only be 'NCDHW' or 'NDHWC'." % str(data_format)
+            f"Unknown Attr(data_format): '{str(data_format)}' ."
+            "It can only be 'NCDHW' or 'NDHWC'."
         )
 
     channel_last = data_format == "NDHWC"
diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py
index 78d88d53ff500..9e6f3445eaf99 100644
--- a/test/legacy_test/test_conv3d_transpose_op.py
+++ b/test/legacy_test/test_conv3d_transpose_op.py
@@ -42,8 +42,8 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs):
     padding_algorithm = attrs['padding_algorithm']
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if attrs['data_format'] == 'NHWC':
diff --git a/test/legacy_test/test_conv3d_transpose_part2_op.py b/test/legacy_test/test_conv3d_transpose_part2_op.py
new file mode 100644
index 0000000000000..da75a5720a80d
--- /dev/null
+++ b/test/legacy_test/test_conv3d_transpose_part2_op.py
@@ -0,0 +1,104 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+sys.path.append("../../legacy_test")
+from test_conv3d_transpose_op import (
+    TestConv3DTransposeOp,
+    create_test_cudnn_bf16_class,
+    create_test_cudnn_fp16_class,
+)
+
+
+class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp):
+    def init_test_case(self):
+        # Three-entry pad; stride, dilation and group count are all 1.
+        self.groups = 1
+        self.pad = [1] * 3
+        self.stride = [1] * 3
+        self.dilations = [1] * 3
+        self.data_format = 'NHWC'
+        self.input_size = [2, 5, 5, 5, 3]  # NDHWC
+        self.filter_size = [self.input_size[-1], 6, 3, 3, 3]
+
+
+class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp):
+    def init_test_case(self):
+        # Six pad entries instead of three; stride, dilation and groups are 1.
+        self.groups = 1
+        self.pad = [1, 0, 1, 0, 1, 2]
+        self.stride = [1] * 3
+        self.dilations = [1] * 3
+        self.data_format = 'NHWC'
+        self.input_size = [2, 5, 5, 5, 3]  # NDHWC
+        self.filter_size = [self.input_size[-1], 6, 3, 3, 3]
+
+
+class TestWithGroups_NHWC(TestConv3DTransposeOp):
+    def init_test_case(self):
+        # Two groups; the trailing input dim (4) becomes the leading filter dim.
+        self.check_no_filter = True
+        self.groups = 2
+        self.pad = [1] * 3
+        self.stride = [1] * 3
+        self.dilations = [1] * 3
+        self.data_format = 'NHWC'
+        self.input_size = [2, 5, 5, 5, 4]  # NDHWC
+        self.filter_size = [self.input_size[-1], 3, 3, 3, 3]
+
+
+class TestWithStride_NHWC(TestConv3DTransposeOp):
+    def init_test_case(self):
+        # Stride of 2 per entry; pad, dilation and group count stay at 1.
+        self.groups = 1
+        self.pad = [1] * 3
+        self.stride = [2] * 3
+        self.dilations = [1] * 3
+        self.data_format = 'NHWC'
+        self.input_size = [2, 5, 5, 5, 3]  # NDHWC
+        self.filter_size = [self.input_size[-1], 6, 3, 3, 3]
+
+
+class TestWithDilation_NHWC(TestConv3DTransposeOp):
+    def init_test_case(self):
+        # Dilation of 2 per entry; pad, stride and group count stay at 1.
+        self.check_no_input = True
+        self.groups = 1
+        self.pad = [1] * 3
+        self.stride = [1] * 3
+        self.dilations = [2] * 3
+        self.data_format = 'NHWC'
+        self.input_size = [2, 5, 5, 5, 3]  # NDHWC
+        self.filter_size = [self.input_size[-1], 6, 3, 3, 3]
+
+
+# Conv3DTransposeCUDNN fp16: each NHWC case goes through the fp16 helper.
+create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC)
+create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC)
+create_test_cudnn_fp16_class(TestWithGroups_NHWC)
+create_test_cudnn_fp16_class(TestWithStride_NHWC)
+create_test_cudnn_fp16_class(TestWithDilation_NHWC)
+
+
+# Conv3DTransposeCUDNN bf16: each NHWC case goes through the bf16 helper.
+create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC)
+create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC)
+create_test_cudnn_bf16_class(TestWithGroups_NHWC)
+create_test_cudnn_bf16_class(TestWithStride_NHWC)
+create_test_cudnn_bf16_class(TestWithDilation_NHWC)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_conv_nn_grad.py b/test/legacy_test/test_conv_nn_grad.py
similarity index 58%
rename from test/deprecated/legacy_test/test_conv_nn_grad.py
rename to test/legacy_test/test_conv_nn_grad.py
index 40152a181f1c6..58461aefcd9ab 100644
--- a/test/deprecated/legacy_test/test_conv_nn_grad.py
+++ b/test/legacy_test/test_conv_nn_grad.py
@@ -26,23 +26,6 @@
 
 
 class TestConvDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(x, 2, 1, groups=1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -66,28 +49,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConvDoubleGradCheckTest0(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(x, 2, 1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -111,28 +76,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConvDoubleGradCheckTest1(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 3, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(x, 2, 1, padding=1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -156,28 +103,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 3, 4, 2]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(x, 2, 1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -201,28 +130,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DDoubleGradCheckTest1(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 5, 3, 2]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(x, 2, 1, padding=1, bias_attr=False)
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -246,35 +157,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv2DoubleGradCheck_AsyPadding(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 0, 1],
-            bias_attr=False,
-            use_cudnn=True,
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -298,35 +184,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv2DoubleGradCheck_PaddingSAME(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="SAME",
-            bias_attr=False,
-            use_cudnn=True,
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -350,35 +211,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv2DoubleGradCheck_PaddingVALID(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="VALID",
-            bias_attr=False,
-            use_cudnn=True,
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -402,37 +238,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv2DoubleGradCheck_ChannelLast(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 1],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NHWC",
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -457,37 +266,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv2DoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv2d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 1, 0],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NHWC",
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -512,35 +294,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DDoubleGradCheck_AsyPadding(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 2, 2, 2]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 0, 1, 1, 2],
-            bias_attr=False,
-            use_cudnn=True,
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -564,36 +321,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DoubleGradCheck_PaddingSAME(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 2, 2, 2]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="SAME",
-            groups=1,
-            bias_attr=False,
-            use_cudnn=True,
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -617,35 +348,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DoubleGradCheck_PaddingVALID(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 3, 3, 2]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding="VALID",
-            bias_attr=False,
-            use_cudnn=True,
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -669,37 +375,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DDoubleGradCheck_ChannelLast(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 2, 2, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 1, 1],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NDHWC",
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -724,37 +403,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestConv3DDoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 2, 2, 2, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-        y = paddle.static.nn.conv3d(
-            input=x,
-            num_filters=2,
-            filter_size=1,
-            padding=[1, 0, 1, 0, 1, 0],
-            bias_attr=False,
-            use_cudnn=True,
-            groups=1,
-            data_format="NDHWC",
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -779,35 +431,10 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
 class TestDepthWiseConvDoubleGradCheck(unittest.TestCase):
-    @prog_scope()
-    def func(self, place):
-        shape = [2, 4, 3, 3]
-        eps = 0.005
-        dtype = np.float32 if base.core.is_compiled_with_rocm() else np.float64
-        x = paddle.static.data('x', shape, dtype)
-
-        # condition of depthwise conv:
-        # use_cudnn == False
-        # groups == filters
-        # num_filters % num_channels == 0
-        y = paddle.static.nn.conv2d(
-            x, shape[1], 1, groups=shape[1], bias_attr=False, use_cudnn=False
-        )
-        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
-
-        w = base.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
-        gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps
-        )
-
     @test_with_pir_api
     @prog_scope()
     def func_pir(self, place):
@@ -832,7 +459,6 @@ def test_grad(self):
         if core.is_compiled_with_cuda():
             places.append(base.CUDAPlace(0))
         for p in places:
-            self.func(p)
             self.func_pir(p)
 
 
diff --git a/test/deprecated/legacy_test/test_copysign_op.py b/test/legacy_test/test_copysign_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_copysign_op.py
rename to test/legacy_test/test_copysign_op.py
diff --git a/test/legacy_test/test_cpuonly_launch.sh b/test/legacy_test/test_cpuonly_launch.sh
index 1c35166cf4434..8048e2697167e 100644
--- a/test/legacy_test/test_cpuonly_launch.sh
+++ b/test/legacy_test/test_cpuonly_launch.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,7 +22,7 @@ function test_launch_cpuonly(){
     else
         if grep -q "CPUONLY" ut.elog; then
             echo "test_launch_cpuonly successfully"
-        else 
+        else
             echo "test_launch_cpuonly failed"
             exit -1
         fi
diff --git a/test/deprecated/legacy_test/test_crop_tensor_op.py b/test/legacy_test/test_crop_tensor_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_crop_tensor_op.py
rename to test/legacy_test/test_crop_tensor_op.py
diff --git a/test/deprecated/legacy_test/test_cross_entropy2_op.py b/test/legacy_test/test_cross_entropy2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_cross_entropy2_op.py
rename to test/legacy_test/test_cross_entropy2_op.py
diff --git a/test/deprecated/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_cross_entropy_op.py
rename to test/legacy_test/test_cross_entropy_op.py
diff --git a/test/deprecated/legacy_test/test_cummax_op.py b/test/legacy_test/test_cummax_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_cummax_op.py
rename to test/legacy_test/test_cummax_op.py
diff --git a/test/deprecated/legacy_test/test_cumprod_op.py b/test/legacy_test/test_cumprod_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_cumprod_op.py
rename to test/legacy_test/test_cumprod_op.py
diff --git a/test/deprecated/legacy_test/test_data_norm_op.py b/test/legacy_test/test_data_norm_op.py
similarity index 91%
rename from test/deprecated/legacy_test/test_data_norm_op.py
rename to test/legacy_test/test_data_norm_op.py
index 954c3da834fd7..8c6c4d599f180 100644
--- a/test/deprecated/legacy_test/test_data_norm_op.py
+++ b/test/legacy_test/test_data_norm_op.py
@@ -19,9 +19,7 @@
 from op import Operator
 from op_test import OpTest
 
-import paddle
-from paddle import base
-from paddle.base import Program, core, program_guard
+from paddle.base import core
 
 
 def _reference_testing(x, batch_size, batch_sum, batch_square_sum, slot_dim=-1):
@@ -524,37 +522,5 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Y', no_grad_set=set(), check_dygraph=False)
 
 
-class TestDataNormOpErrorr(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            x2 = paddle.static.data(name='x2', shape=[-1, 3, 4], dtype="int32")
-            # self.assertRaises(TypeError, base.data_norm, x2)
-            paddle.static.nn.data_norm(
-                input=x2, param_attr={}, enable_scale_and_shift=True
-            )
-
-            # Test input with dimension 1
-            paddle.enable_static()
-            x3 = paddle.static.data("", shape=[0], dtype="float32")
-            self.assertRaises(ValueError, paddle.static.nn.data_norm, x3)
-
-            # The size of input in data_norm should not be 0.
-            def test_0_size():
-                paddle.enable_static()
-                x = paddle.static.data(name='x', shape=[0, 3], dtype='float32')
-                out = paddle.static.nn.data_norm(x, slot_dim=1)
-                cpu = base.core.CPUPlace()
-                exe = base.Executor(cpu)
-                exe.run(base.default_startup_program())
-                test_program = base.default_main_program().clone(for_test=True)
-                exe.run(
-                    test_program,
-                    fetch_list=out,
-                    feed={'x': np.ones([0, 3]).astype('float32')},
-                )
-
-            self.assertRaises(ValueError, test_0_size)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/legacy_test/test_deform_conv2d.py b/test/legacy_test/test_deform_conv2d.py
new file mode 100644
index 0000000000000..95e180c373842
--- /dev/null
+++ b/test/legacy_test/test_deform_conv2d.py
@@ -0,0 +1,346 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest import TestCase
+
+import numpy as np
+
+import paddle
+import paddle.nn.initializer as I
+from paddle.pir_utils import test_with_pir_api
+
+
+class TestDeformConv2D(TestCase):
+    batch_size = 4
+    spatial_shape = (5, 5)
+    dtype = "float32"
+
+    def setUp(self):
+        self.in_channels = 2
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [0, 0]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.deformable_groups = 1
+        self.groups = 1
+        self.no_bias = True
+
+    def prepare(self):
+        np.random.seed(1)
+        paddle.seed(1)
+        if isinstance(self.kernel_size, int):
+            filter_shape = (self.kernel_size,) * 2
+        else:
+            filter_shape = tuple(self.kernel_size)
+        self.filter_shape = filter_shape
+
+        self.weight = np.random.uniform(
+            -1,
+            1,
+            (self.out_channels, self.in_channels // self.groups) + filter_shape,
+        ).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype(
+                self.dtype
+            )
+
+        def out_size(
+            in_size, pad_size, dilation_size, kernel_size, stride_size
+        ):
+            return (
+                in_size + 2 * pad_size - (dilation_size * (kernel_size - 1) + 1)
+            ) / stride_size + 1
+
+        out_h = int(
+            out_size(
+                self.spatial_shape[0],
+                self.padding[0],
+                self.dilation[0],
+                self.kernel_size[0],
+                self.stride[0],
+            )
+        )
+        out_w = int(
+            out_size(
+                self.spatial_shape[1],
+                self.padding[1],
+                self.dilation[1],
+                self.kernel_size[1],
+                self.stride[1],
+            )
+        )
+        out_shape = (out_h, out_w)
+
+        self.input_shape = (
+            self.batch_size,
+            self.in_channels,
+        ) + self.spatial_shape
+
+        self.offset_shape = (
+            self.batch_size,
+            self.deformable_groups * 2 * filter_shape[0] * filter_shape[1],
+        ) + out_shape
+
+        self.mask_shape = (
+            self.batch_size,
+            self.deformable_groups * filter_shape[0] * filter_shape[1],
+        ) + out_shape
+
+        self.input = np.random.uniform(-1, 1, self.input_shape).astype(
+            self.dtype
+        )
+
+        self.offset = np.random.uniform(-1, 1, self.offset_shape).astype(
+            self.dtype
+        )
+
+        self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype)
+
+    def static_graph_case_dcn(self):
+        """Build and run two DeformConv2D layers in static-graph mode.
+
+        Both layers are constructed with the same arguments. The first is
+        called with ``None`` in place of the mask, the second with the mask
+        placeholder.
+
+        The ``input``, ``offset`` and ``mask`` placeholders use ``-1`` for
+        every dimension except dimension 1; the concrete arrays created by
+        ``prepare()`` are supplied through ``feed``.
+
+        ``self.place`` must be set and ``prepare()`` must have been called
+        before this method runs.
+
+        Returns:
+            tuple: ``(out_v1, out_v2)`` as fetched by the executor, i.e. the
+            result without mask followed by the result with mask.
+        """
+
+        def new_layer():
+            # Every call constructs a separate layer and separate
+            # initializer objects.
+            return paddle.vision.ops.DeformConv2D(
+                in_channels=self.in_channels,
+                out_channels=self.out_channels,
+                kernel_size=self.filter_shape,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+                deformable_groups=self.deformable_groups,
+                weight_attr=I.Assign(self.weight),
+                bias_attr=False if self.no_bias else I.Assign(self.bias),
+            )
+
+        main = paddle.static.Program()
+        start = paddle.static.Program()
+        paddle.enable_static()
+
+        # Sizes of dimension 1 for the offset and mask placeholders.
+        kernel_area = self.filter_shape[0] * self.filter_shape[1]
+        offset_channels = self.deformable_groups * 2 * kernel_area
+        mask_channels = self.deformable_groups * kernel_area
+
+        with paddle.static.program_guard(main, start):
+            x = paddle.static.data(
+                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype
+            )
+            offset = paddle.static.data(
+                "offset", (-1, offset_channels, -1, -1), dtype=self.dtype
+            )
+            mask = paddle.static.data(
+                "mask", (-1, mask_channels, -1, -1), dtype=self.dtype
+            )
+
+            y_v1 = new_layer()(x, offset, None)
+            y_v2 = new_layer()(x, offset, mask)
+
+        # Run the startup program first, then fetch both outputs from a
+        # single run of the main program.
+        exe = paddle.static.Executor(self.place)
+        exe.run(start)
+        out_v1, out_v2 = exe.run(
+            main,
+            feed={
+                "input": self.input,
+                "offset": self.offset,
+                "mask": self.mask,
+            },
+            fetch_list=[y_v1, y_v2],
+        )
+        return out_v1, out_v2
+
+    def dygraph_case_dcn(self):
+        """Return dygraph DeformConv2D outputs as (without mask, with mask)."""
+        paddle.disable_static()
+        x = paddle.to_tensor(self.input)
+        offset = paddle.to_tensor(self.offset)
+        mask = paddle.to_tensor(self.mask)
+
+        # A bias, when enabled, is supplied through ``bias_attr`` below.
+        deform_conv2d = paddle.vision.ops.DeformConv2D(
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            deformable_groups=self.deformable_groups,
+            groups=self.groups,
+            weight_attr=I.Assign(self.weight),
+            bias_attr=False if self.no_bias else I.Assign(self.bias),
+        )
+
+        y_v1 = deform_conv2d(x, offset)
+        y_v2 = deform_conv2d(x, offset, mask)
+
+        out_v1 = y_v1.numpy()
+        out_v2 = y_v2.numpy()
+
+        return out_v1, out_v2
+
+    @test_with_pir_api
+    def _test_identity(self):
+        """Static and dygraph outputs must match, without and with mask."""
+        self.prepare()
+        outs = zip(self.static_graph_case_dcn(), self.dygraph_case_dcn())
+        for static_out, dygraph_out in outs:
+            np.testing.assert_array_almost_equal(static_out, dygraph_out)
+
+    def test_identity(self):
+        self.place = paddle.CPUPlace()
+        self._test_identity()
+
+        if paddle.is_compiled_with_cuda():
+            self.place = paddle.CUDAPlace(0)
+            self._test_identity()
+
+
+# testcases for DeformConv2D
+class TestDeformConv2DWithPadding(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [2, 2]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.deformable_groups = 1
+        self.groups = 1
+        self.no_bias = True
+
+
+class TestDeformConv2DWithBias(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [2, 2]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.deformable_groups = 1
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithAsynPadding(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 2]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.deformable_groups = 1
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithDilation(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride = [1, 1]
+        self.dilation = [3, 3]
+        self.deformable_groups = 1
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithStride(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride = [2, 2]
+        self.dilation = [1, 1]
+        self.deformable_groups = 1
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithDeformable_Groups(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 5
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.deformable_groups = 5
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithGroups(TestDeformConv2D):
+    def setUp(self):
+        self.in_channels = 5
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.deformable_groups = 1
+        self.groups = 5
+        self.no_bias = False
+
+
+class TestDeformConv2DError(unittest.TestCase):
+    @test_with_pir_api
+    def test_input_error(self):
+        def test_input_rank_error():
+            paddle.enable_static()
+            x = paddle.static.data(name='error_x_1', shape=[0], dtype='float32')
+            offset = paddle.static.data(
+                name='error_offset_1', shape=[0], dtype='float32'
+            )
+            mask = paddle.static.data(
+                name='error_mask_1', shape=[0, 0, 0], dtype='float32'
+            )
+            out = paddle.vision.ops.DeformConv2D(
+                in_channels=0,
+                out_channels=0,
+                kernel_size=0,
+                deformable_groups=0,
+            )(x, offset, mask)
+
+        self.assertRaises(AssertionError, test_input_rank_error)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_deformable_conv_op.py b/test/legacy_test/test_deformable_conv_op.py
similarity index 85%
rename from test/deprecated/legacy_test/test_deformable_conv_op.py
rename to test/legacy_test/test_deformable_conv_op.py
index 63d939cc85626..23b49f4f93606 100644
--- a/test/deprecated/legacy_test/test_deformable_conv_op.py
+++ b/test/legacy_test/test_deformable_conv_op.py
@@ -372,73 +372,6 @@ def init_test_case(self):
 
 
 class TestModulatedDeformableConvInvalidInput(unittest.TestCase):
-    def test_error(self):
-        def test_invalid_input():
-            paddle.enable_static()
-            input = [1, 3, 32, 32]
-            offset = paddle.static.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            mask = paddle.static.data(
-                name='mask', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            loss = paddle.static.nn.common.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=1
-            )
-
-        self.assertRaises(TypeError, test_invalid_input)
-
-        def test_invalid_offset():
-            paddle.enable_static()
-            input = paddle.static.data(
-                name='input', shape=[None, 3, 32, 32], dtype='int32'
-            )
-            offset = paddle.static.data(
-                name='offset', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            mask = paddle.static.data(
-                name='mask', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            loss = paddle.static.nn.common.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=1
-            )
-
-        self.assertRaises(TypeError, test_invalid_offset)
-
-        def test_invalid_filter():
-            paddle.enable_static()
-            input = paddle.static.data(
-                name='input_filter', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            offset = paddle.static.data(
-                name='offset_filter', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            mask = paddle.static.data(
-                name='mask_filter', shape=[None, 3, 32, 32], dtype='float32'
-            )
-            loss = paddle.static.nn.common.deformable_conv(
-                input, offset, mask, num_filters=4, filter_size=0
-            )
-
-        self.assertRaises(ValueError, test_invalid_filter)
-
-        def test_invalid_groups():
-            paddle.enable_static()
-            input = paddle.static.data(
-                name='input_groups', shape=[1, 1, 1, 1], dtype='float32'
-            )
-            offset = paddle.static.data(
-                name='offset_groups', shape=[1, 1], dtype='float32'
-            )
-            mask = paddle.static.data(
-                name='mask_groups', shape=[1], dtype='float32'
-            )
-            paddle.static.nn.deform_conv2d(
-                input, offset, mask, 1, 1, padding=1, groups=0
-            )
-
-        self.assertRaises(ValueError, test_invalid_groups)
-
     @test_with_pir_api
     def test_error_api(self):
         def test_invalid_input():
diff --git a/test/deprecated/legacy_test/test_deformable_conv_v1_op.py b/test/legacy_test/test_deformable_conv_v1_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_deformable_conv_v1_op.py
rename to test/legacy_test/test_deformable_conv_v1_op.py
diff --git a/test/deprecated/legacy_test/test_determinant_op.py b/test/legacy_test/test_determinant_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_determinant_op.py
rename to test/legacy_test/test_determinant_op.py
diff --git a/test/legacy_test/test_device_guard.py b/test/legacy_test/test_device_guard.py
new file mode 100644
index 0000000000000..9d53982992ab7
--- /dev/null
+++ b/test/legacy_test/test_device_guard.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+
+paddle.enable_static()
+
+
+def execute(main_program, startup_program):
+    """Run the startup program and then the main program once each."""
+    # CUDAPlace(0) is used when Paddle is compiled with CUDA, else CPUPlace.
+    use_cuda = paddle.is_compiled_with_cuda()
+    device = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+    runner = paddle.static.Executor(device)
+    runner.run(startup_program)
+    runner.run(main_program)
+
+
+def get_valid_warning_num(warning, w):
+    """Count entries of ``w`` whose message text contains ``warning``."""
+    hits = 0
+    for record in w:
+        hits += warning in str(record.message)
+    return hits
+
+
+class TestDeviceGuard(unittest.TestCase):
+    def test_cpu_only_op(self):
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, startup_program):
+            x = paddle.full(
+                shape=[2, 255, 13, 13], fill_value=0.3, dtype='float32'
+            )
+            gt_box = paddle.full(
+                shape=[2, 6, 4], fill_value=0.5, dtype='float32'
+            )
+            gt_label = paddle.full(shape=[2, 6], fill_value=1.0, dtype='int32')
+            gt_score = paddle.full(
+                shape=[2, 6], fill_value=0.5, dtype='float32'
+            )
+            anchors = [
+                10,
+                13,
+                16,
+                30,
+                33,
+                23,
+                30,
+                61,
+                62,
+                45,
+                59,
+                119,
+                116,
+                90,
+                156,
+                198,
+                373,
+                326,
+            ]
+            anchor_mask = [0, 1, 2]
+            with paddle.static.device_guard("gpu"):
+                # yolo_loss only has cpu kernel, so its cpu kernel will be executed
+                loss = paddle.vision.ops.yolo_loss(
+                    x=x,
+                    gt_box=gt_box,
+                    gt_label=gt_label,
+                    gt_score=gt_score,
+                    anchors=anchors,
+                    anchor_mask=anchor_mask,
+                    class_num=80,
+                    ignore_thresh=0.7,
+                    downsample_ratio=32,
+                )
+
+        execute(main_program, startup_program)
+
+    def test_error(self):
+        """Building an op under device "cpu1" or "cpu:1" raises ValueError."""
+
+        def build_under(device):
+            with paddle.static.device_guard(device):
+                paddle.full(shape=[1], fill_value=0.2, dtype='float32')
+
+        # subTest keeps the two device strings as separately reported cases.
+        for bad_device in ("cpu1", "cpu:1"):
+            with self.subTest(device=bad_device):
+                self.assertRaises(ValueError, build_under, bad_device)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_diagonal_op.py b/test/legacy_test/test_diagonal_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_diagonal_op.py
rename to test/legacy_test/test_diagonal_op.py
diff --git a/test/legacy_test/test_dist_base.py b/test/legacy_test/test_dist_base.py
index 0abf18fe42c87..143f7e1ee8e62 100755
--- a/test/legacy_test/test_dist_base.py
+++ b/test/legacy_test/test_dist_base.py
@@ -1040,7 +1040,7 @@ def __free_port():
             ) as s:
                 s.bind(('', 0))
                 print_to_err(
-                    type(self).__name__, "socket name: %s" % s.getsockname()[1]
+                    type(self).__name__, f"socket name: {s.getsockname()[1]}"
                 )
                 return s.getsockname()[1]
 
@@ -1479,10 +1479,9 @@ def _get_nccl2_trainer_cmd(
     def _run_cluster_gloo(
         self, model, envs, update_method, check_error_log, log_name
     ):
-        assert update_method == "gloo", (
-            "_run_cluster_gloo must have update_method: gloo, but get %s"
-            % update_method
-        )
+        assert (
+            update_method == "gloo"
+        ), f"_run_cluster_gloo must have update_method: gloo, but get {update_method}"
         assert (
             not self._use_hallreduce
         ), "_run_cluster_gloo must have _use_hallreduce = false"
@@ -1551,9 +1550,7 @@ def _run_cluster_nccl2(
             if DIST_UT_PORT == 0:
                 # NOTE(wangxi). hallreduce test must use 4cards after nccl>=2.7
                 for i in range(0, 4):
-                    self._ps_endpoints += "127.0.0.1:%s," % (
-                        self._find_free_port()
-                    )
+                    self._ps_endpoints += f"127.0.0.1:{self._find_free_port()},"
             else:
                 for i in range(0, 4):
                     self._ps_endpoints += "127.0.0.1:%s," % (DIST_UT_PORT + i)
diff --git a/test/legacy_test/test_dist_hapi_model.py b/test/legacy_test/test_dist_hapi_model.py
index 03a92d6f3cbc9..e41f5b344a594 100644
--- a/test/legacy_test/test_dist_hapi_model.py
+++ b/test/legacy_test/test_dist_hapi_model.py
@@ -70,9 +70,11 @@ def start_local_trainers(
     procs = []
     for t in pod.trainers:
         proc_env = {
-            "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]),
+            "FLAGS_selected_gpus": "{}".format(
+                ",".join([str(g) for g in t.gpus])
+            ),
             "PADDLE_TRAINER_ID": "%d" % t.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
+            "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}",
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
             "FLAGS_dynamic_static_unified_comm": "0",
diff --git a/test/deprecated/legacy_test/test_eigh_op.py b/test/legacy_test/test_eigh_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_eigh_op.py
rename to test/legacy_test/test_eigh_op.py
diff --git a/test/deprecated/legacy_test/test_elementwise_heaviside_op.py b/test/legacy_test/test_elementwise_heaviside_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_elementwise_heaviside_op.py
rename to test/legacy_test/test_elementwise_heaviside_op.py
diff --git a/test/deprecated/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_elementwise_mul_op.py
rename to test/legacy_test/test_elementwise_mul_op.py
diff --git a/test/deprecated/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_elementwise_pow_op.py
rename to test/legacy_test/test_elementwise_pow_op.py
diff --git a/test/deprecated/legacy_test/test_executor_and_use_program_cache.py b/test/legacy_test/test_executor_and_use_program_cache.py
similarity index 66%
rename from test/deprecated/legacy_test/test_executor_and_use_program_cache.py
rename to test/legacy_test/test_executor_and_use_program_cache.py
index c43b58f027ca1..15df67914856d 100644
--- a/test/deprecated/legacy_test/test_executor_and_use_program_cache.py
+++ b/test/legacy_test/test_executor_and_use_program_cache.py
@@ -25,66 +25,6 @@
 from paddle.pir_utils import test_with_pir_api
 
 
-class TestExecutor(unittest.TestCase):
-    def test_mul(self):
-        main_program = base.Program()
-        startup_program = base.Program()
-        with base.program_guard(main_program, startup_program):
-            a = paddle.static.data(name='a', shape=[-1, 784], dtype='float32')
-            b = paddle.static.data(name='b', shape=[784, 100], dtype='float32')
-            a.desc.set_need_check_feed(False)
-            b.desc.set_need_check_feed(False)
-            output = paddle.matmul(x=a, y=b)
-
-        # Compute with numpy
-        a_np = np.random.random((100, 784)).astype('float32')
-        b_np = np.random.random((784, 100)).astype('float32')
-        out_np = np.dot(a_np, b_np)
-
-        place = paddle.CPUPlace()
-        exe = base.Executor(place)
-
-        def _train(use_program_cache, max_iters=1):
-            import time
-
-            run_time = 0.0
-            for i in range(max_iters):
-                begin = time.time()
-                outs = exe.run(
-                    program=main_program,
-                    feed={'a': a_np, 'b': b_np},
-                    fetch_list=[output],
-                    use_program_cache=use_program_cache,
-                )
-                end = time.time()
-                run_time += end - begin
-                out = outs[0]
-                self.assertEqual((100, 100), out.shape)
-                np.testing.assert_allclose(out, out_np, rtol=1e-05)
-            return run_time
-
-        max_iters = 3
-        run_time_with_cache = _train(
-            use_program_cache=True, max_iters=max_iters
-        )
-        print("run time with program cache: %f" % run_time_with_cache)
-
-        run_time_without_cache = _train(
-            use_program_cache=False, max_iters=max_iters
-        )
-        print("run time without program cache: %f" % run_time_without_cache)
-
-        run_time_with_cache = _train(
-            use_program_cache=True, max_iters=max_iters
-        )
-        print("run time with program cache: %f" % run_time_with_cache)
-
-        run_time_with_cache = _train(
-            use_program_cache=True, max_iters=max_iters
-        )
-        print("run time with program cache: %f" % run_time_with_cache)
-
-
 class ExecutorPaddingRNNTest(PaddingRNNTestBase):
     def train_and_save_inference_program(
         self, rnn_model="static", use_program_cache=True
diff --git a/test/deprecated/legacy_test/test_expand_v2_op.py b/test/legacy_test/test_expand_v2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_expand_v2_op.py
rename to test/legacy_test/test_expand_v2_op.py
diff --git a/test/legacy_test/test_fc_op.py b/test/legacy_test/test_fc_op.py
new file mode 100644
index 0000000000000..d61c93361097b
--- /dev/null
+++ b/test/legacy_test/test_fc_op.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import OpTest
+
+SEED = 2020
+
+
+def fc_refer(matrix, with_bias, with_relu=False):
+    """NumPy reference for the fc op: flatten, matmul, optional bias/relu."""
+    batch, chans, height, width = matrix.input.shape
+    rows, cols = matrix.weights.shape
+
+    # Collapse the (C, H, W) axes so the input becomes a 2-D matrix.
+    flat_input = np.reshape(matrix.input, [batch, chans * height * width])
+    flat_weights = np.reshape(matrix.weights, [rows, cols])
+    # Reshaped to a row vector up front, even when the bias ends up unused.
+    row_bias = np.reshape(matrix.bias, [1, cols])
+
+    product = np.dot(flat_input, flat_weights)
+    if with_bias:
+        product = product + row_bias
+
+    if with_relu:
+        return np.maximum(product, 0)
+    return product
+
+
+class MatrixGenerate:
+    """Random float32 input/weights/bias fixture shared by the fc tests."""
+
+    def __init__(self, mb, ic, oc, h, w, bias_dims=2):
+        bias_shape = (1, oc) if bias_dims == 2 else oc
+        self.input = np.random.random((mb, ic, h, w)).astype("float32")
+        self.weights = np.random.random((ic * h * w, oc)).astype("float32")
+        self.bias = np.random.random(bias_shape).astype("float32")
+
+
+class TestFCOp(OpTest):
+    """Checks the `fc` op output against the NumPy reference `fc_refer`."""
+
+    def config(self):
+        # Subclasses override this hook to vary bias/relu and the shapes.
+        self.with_bias = True
+        self.with_relu = True
+        self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2)
+
+    def setUp(self):
+        self.op_type = "fc"
+        self.config()
+
+        # 'Bias' is only fed when the case enables it.
+        self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
+        if self.with_bias:
+            self.inputs['Bias'] = self.matrix.bias
+
+        self.attrs = {
+            'use_mkldnn': False,
+            'activation_type': "relu" if self.with_relu else "",
+        }
+
+        # Expected output comes from the NumPy reference implementation.
+        self.outputs = {
+            'Out': fc_refer(self.matrix, self.with_bias, self.with_relu)
+        }
+
+    def test_check_output(self):
+        # Dygraph checking is switched off for this op.
+        self.check_output(check_dygraph=False)
+
+
+class TestFCOpNoBias1(TestFCOp):
+    def config(self):
+        self.with_bias = False
+        self.with_relu = False
+        self.matrix = MatrixGenerate(2, 8, 10, 1, 1, 2)
+
+
+class TestFCOpNoBias2(TestFCOp):
+    def config(self):
+        self.with_bias = False
+        self.with_relu = False
+        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
+
+
+class TestFCOpNoBias4(TestFCOp):
+    def config(self):
+        self.with_bias = False
+        self.with_relu = False
+        self.matrix = MatrixGenerate(1, 32, 64, 3, 3, 1)
+
+
+class TestFCOpWithBias1(TestFCOp):
+    def config(self):
+        self.with_bias = True
+        self.with_relu = False
+        self.matrix = MatrixGenerate(3, 8, 10, 2, 1, 2)
+
+
+class TestFCOpWithBias2(TestFCOp):
+    def config(self):
+        self.with_bias = True
+        self.with_relu = True
+        self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1)
+
+
+class TestFCOpWithBias3(TestFCOp):
+    def config(self):
+        self.with_bias = True
+        self.with_relu = True
+        self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1)
+
+
+class TestFCOpWithPadding(TestFCOp):
+    def config(self):
+        self.with_bias = True
+        self.with_relu = True
+        self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_fill_any_op.py b/test/legacy_test/test_fill_any_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fill_any_op.py
rename to test/legacy_test/test_fill_any_op.py
diff --git a/test/deprecated/legacy_test/test_fill_diagonal_tensor_op.py b/test/legacy_test/test_fill_diagonal_tensor_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fill_diagonal_tensor_op.py
rename to test/legacy_test/test_fill_diagonal_tensor_op.py
diff --git a/test/deprecated/legacy_test/test_flatten_contiguous_range_op.py b/test/legacy_test/test_flatten_contiguous_range_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_flatten_contiguous_range_op.py
rename to test/legacy_test/test_flatten_contiguous_range_op.py
diff --git a/test/legacy_test/test_fleet_launch_async.sh b/test/legacy_test/test_fleet_launch_async.sh
index f50e24f10beca..88a53788719ad 100644
--- a/test/legacy_test/test_fleet_launch_async.sh
+++ b/test/legacy_test/test_fleet_launch_async.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/test_fleet_launch_cloud.sh b/test/legacy_test/test_fleet_launch_cloud.sh
index 0d05b73d3566f..08079ea2848cf 100644
--- a/test/legacy_test/test_fleet_launch_cloud.sh
+++ b/test/legacy_test/test_fleet_launch_cloud.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/test_fleet_launch_elastic.sh b/test/legacy_test/test_fleet_launch_elastic.sh
index a3e76a564f5b7..07d4dc993f3ae 100644
--- a/test/legacy_test/test_fleet_launch_elastic.sh
+++ b/test/legacy_test/test_fleet_launch_elastic.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,11 +19,11 @@ rm -rf log*
 
 pids=`ps -ef | grep "python -m paddle.distributed.launch elastic_demo.[py]" | awk '{print $2}'`
 if [ -n "$pids" ]; then
-    echo $pids | xargs kill -9 
+    echo $pids | xargs kill -9
 fi
 pids=`ps -ef | grep "/usr/bin/python -u elastic_demo.[py]" | awk '{print $2}'`
 if [ -n "$pids" ]; then
-    echo $pids | xargs kill -9 
+    echo $pids | xargs kill -9
 fi
 
 python -m pip install --no-cache-dir etcd3 -i https://mirror.baidu.com/pypi/simple
@@ -102,7 +102,7 @@ check_env() {
         echo "PADDLE_TRAINERS error"
         exit -1
     fi
-    
+
     if grep -q "0-DISTRIBUTED_TRAINER_ENDPOINTS=$DISTRIBUTED_TRAINER_ENDPOINTS" $lw0 && grep -q "1-DISTRIBUTED_TRAINER_ENDPOINTS=$DISTRIBUTED_TRAINER_ENDPOINTS" $lw0; then
         echo "DISTRIBUTED_TRAINER_ENDPOINTS ok"
     else
diff --git a/test/legacy_test/test_fleet_launch_nproc.sh b/test/legacy_test/test_fleet_launch_nproc.sh
index 63fce18683c04..5371b90822e15 100644
--- a/test/legacy_test/test_fleet_launch_nproc.sh
+++ b/test/legacy_test/test_fleet_launch_nproc.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/test_fleet_launch_ps.sh b/test/legacy_test/test_fleet_launch_ps.sh
index bfbaf258c86b4..9b81cd4866a62 100644
--- a/test/legacy_test/test_fleet_launch_ps.sh
+++ b/test/legacy_test/test_fleet_launch_ps.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/test_fleet_launch_rank_mapping.sh b/test/legacy_test/test_fleet_launch_rank_mapping.sh
index eb84f9f6e847a..abd347664dc01 100755
--- a/test/legacy_test/test_fleet_launch_rank_mapping.sh
+++ b/test/legacy_test/test_fleet_launch_rank_mapping.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/test_fleet_run_random_port.sh b/test/legacy_test/test_fleet_run_random_port.sh
index 9ca48f2ab5bb3..bb71f883d30e4 100644
--- a/test/legacy_test/test_fleet_run_random_port.sh
+++ b/test/legacy_test/test_fleet_run_random_port.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/legacy_test/test_fleetrun.sh b/test/legacy_test/test_fleetrun.sh
index 710859727d2c9..f04245fcf0c09 100644
--- a/test/legacy_test/test_fleetrun.sh
+++ b/test/legacy_test/test_fleetrun.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/deprecated/legacy_test/test_flip.py b/test/legacy_test/test_flip.py
similarity index 100%
rename from test/deprecated/legacy_test/test_flip.py
rename to test/legacy_test/test_flip.py
diff --git a/test/deprecated/legacy_test/test_fmax_op.py b/test/legacy_test/test_fmax_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fmax_op.py
rename to test/legacy_test/test_fmax_op.py
diff --git a/test/deprecated/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fmin_op.py
rename to test/legacy_test/test_fmin_op.py
diff --git a/test/deprecated/legacy_test/test_fold_op.py b/test/legacy_test/test_fold_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fold_op.py
rename to test/legacy_test/test_fold_op.py
diff --git a/test/deprecated/legacy_test/test_fractional_max_pool2d_op.py b/test/legacy_test/test_fractional_max_pool2d_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_fractional_max_pool2d_op.py
rename to test/legacy_test/test_fractional_max_pool2d_op.py
diff --git a/test/deprecated/legacy_test/test_full_like_op.py b/test/legacy_test/test_full_like_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_full_like_op.py
rename to test/legacy_test/test_full_like_op.py
diff --git a/test/legacy_test/test_functional_conv2d.py b/test/legacy_test/test_functional_conv2d.py
new file mode 100644
index 0000000000000..2f7d18e29566f
--- /dev/null
+++ b/test/legacy_test/test_functional_conv2d.py
@@ -0,0 +1,284 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest import TestCase
+
+import numpy as np
+
+import paddle
+import paddle.base.dygraph as dg
+import paddle.nn.functional as F
+from paddle import base
+
+
+class TestFunctionalConv2DError(TestCase):
+    """Builds an `F.conv2d` static graph that must raise ValueError."""
+
+    batch_size = 4
+    spatial_shape = (16, 16)
+    dtype = "float32"
+
+    def setUp(self):
+        # NOTE(review): "not_valid" padding is presumably what gets rejected.
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = "not_valid"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NHWC"
+
+    def test_exception(self):
+        # Weight/bias shapes must exist before the graph is constructed.
+        self.prepare()
+        with self.assertRaises(ValueError):
+            self.static_graph_case()
+
+    def prepare(self):
+        """Derive weight and bias shapes from the configured attributes."""
+        if isinstance(self.filter_shape, int):
+            filter_shape = (self.filter_shape,) * 2
+        else:
+            filter_shape = tuple(self.filter_shape)
+        self.weight_shape = (
+            self.out_channels,
+            self.in_channels // self.groups,
+        ) + filter_shape
+        self.bias_shape = (self.out_channels,)
+
+    def static_graph_case(self):
+        """Declare input/weight/bias and call `F.conv2d` in fresh programs."""
+        main = base.Program()
+        start = base.Program()
+        with base.unique_name.guard():
+            with base.program_guard(main, start):
+                self.channel_last = self.data_format == "NHWC"
+                # Only the channel axis is fixed; other dims are left as -1.
+                if self.channel_last:
+                    input_shape = (-1, -1, -1, self.in_channels)
+                else:
+                    input_shape = (-1, self.in_channels, -1, -1)
+                x = paddle.static.data("input", input_shape, dtype=self.dtype)
+                weight = paddle.static.data(
+                    "weight", self.weight_shape, dtype=self.dtype
+                )
+                if not self.no_bias:
+                    bias = paddle.static.data(
+                        "bias", self.bias_shape, dtype=self.dtype
+                    )
+                y = F.conv2d(
+                    x,
+                    weight,
+                    None if self.no_bias else bias,
+                    padding=self.padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format,
+                )
+
+
+class TestFunctionalConv2DErrorCase2(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = [[0, 0], [1, 2], [3, 4], [5, 6]]
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = False
+        self.data_format = "NCHW"
+
+
+class TestFunctionalConv2DErrorCase3(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 4
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 2
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = False
+        self.data_format = "not_valid"
+
+
+class TestFunctionalConv2DErrorCase4(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 4
+        self.out_channels = 3
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 2
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = False
+        self.data_format = "NCHW"
+
+
+class TestFunctionalConv2DErrorCase7(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = True
+        self.data_format = "not_valid"
+
+
+class TestFunctionalConv2DErrorCase8(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = [1, 2, 1, 2, 1]
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = True
+        self.data_format = "NCHW"
+
+
+class TestFunctionalConv2DErrorCase9(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = -5
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = [[0, 0], [0, 0], [3, 2], [1, 2]]
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = False
+        self.data_format = "NCHW"
+
+
+class TestFunctionalConv2DErrorCase10(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 4
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 2
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = False
+        self.data_format = "NHWC"
+
+
+class TestFunctionalConv2DErrorCase11(TestFunctionalConv2DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.use_cudnn = False
+        self.data_format = "NHCW"
+
+
+class TestFunctionalConv2DErrorCase12(TestCase):
+    """Dygraph `F.conv2d` error case; this base config feeds empty arrays."""
+
+    def setUp(self):
+        self.input = np.array([])
+        self.filter = np.array([])
+        self.num_filters = 0
+        self.filter_size = 0
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.data_format = "NCHW"
+
+    def dygraph_case(self):
+        with dg.guard():
+            inputs = paddle.to_tensor(self.input, dtype=paddle.float32)
+            kernel = paddle.to_tensor(self.filter, dtype=paddle.float32)
+            bias = None
+            if self.bias is not None:
+                bias = paddle.to_tensor(self.bias, dtype=paddle.float32)
+            y = F.conv2d(
+                inputs,
+                kernel,
+                bias,
+                padding=self.padding,
+                stride=self.stride,
+                dilation=self.dilation,
+                groups=self.groups,
+                data_format=self.data_format,
+            )
+
+    def test_dygraph_exception(self):
+        with self.assertRaises(ValueError):
+            self.dygraph_case()
+
+
+class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12):
+    def setUp(self):
+        self.input = np.random.randn(1, 3, 3, 3)
+        self.filter = np.random.randn(3, 3, 1, 1)
+        self.num_filters = 3
+        self.filter_size = 1
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 0
+        self.data_format = "NCHW"
+
+
+class TestFunctionalConv2DErrorCase14(TestFunctionalConv2DErrorCase12):
+    def setUp(self):
+        self.input = np.random.randn(0, 0, 0, 0)
+        self.filter = np.random.randn(1, 0, 0, 0)
+        self.num_filters = 0
+        self.filter_size = 0
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.data_format = "NCHW"
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/legacy_test/test_functional_conv3d.py b/test/legacy_test/test_functional_conv3d.py
new file mode 100644
index 0000000000000..bdfd2c7e6116f
--- /dev/null
+++ b/test/legacy_test/test_functional_conv3d.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest import TestCase
+
+import numpy as np
+
+import paddle
+import paddle.base.dygraph as dg
+import paddle.nn.functional as F
+from paddle import base
+
+
+class TestFunctionalConv3DError(TestCase):
+    """Builds an `F.conv3d` static graph that must raise ValueError."""
+
+    batch_size = 4
+    spatial_shape = (8, 8, 8)
+    dtype = "float32"
+
+    def setUp(self):
+        # NOTE(review): "not_valid" padding is presumably what gets rejected.
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = "not_valid"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NDHWC"
+
+    def test_exception(self):
+        # Weight/bias shapes must exist before the graph is constructed.
+        self.prepare()
+        with self.assertRaises(ValueError):
+            self.static_graph_case()
+
+    def prepare(self):
+        """Derive weight and bias shapes from the configured attributes."""
+        if isinstance(self.filter_shape, int):
+            filter_shape = (self.filter_shape,) * 3
+        else:
+            filter_shape = tuple(self.filter_shape)
+        self.weight_shape = (
+            self.out_channels,
+            self.in_channels // self.groups,
+        ) + filter_shape
+        self.bias_shape = (self.out_channels,)
+
+    def static_graph_case(self):
+        """Declare input/weight/bias and call `F.conv3d` in fresh programs."""
+        main = base.Program()
+        start = base.Program()
+        with base.unique_name.guard():
+            with base.program_guard(main, start):
+                self.channel_last = self.data_format == "NDHWC"
+                # Only the channel axis is fixed; other dims are left as -1.
+                if self.channel_last:
+                    input_shape = (-1, -1, -1, -1, self.in_channels)
+                else:
+                    input_shape = (-1, self.in_channels, -1, -1, -1)
+                x = paddle.static.data("input", input_shape, dtype=self.dtype)
+                weight = paddle.static.data(
+                    "weight", self.weight_shape, dtype=self.dtype
+                )
+                if not self.no_bias:
+                    bias = paddle.static.data(
+                        "bias", self.bias_shape, dtype=self.dtype
+                    )
+                y = F.conv3d(
+                    x,
+                    weight,
+                    None if self.no_bias else bias,
+                    padding=self.padding,
+                    stride=self.stride,
+                    dilation=self.dilation,
+                    groups=self.groups,
+                    data_format=self.data_format,
+                )
+
+                if self.act == 'sigmoid':
+                    y = F.sigmoid(y)
+
+
+class TestFunctionalConv3DErrorCase2(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = [[0, 0], [1, 1], [1, 2], [3, 4], [5, 6]]
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NCDHW"
+
+
+class TestFunctionalConv3DErrorCase3(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 4
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 2
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "not_valid"
+
+
+class TestFunctionalConv3DErrorCase4(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = 4
+        self.out_channels = 3
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 2
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NCDHW"
+
+
+class TestFunctionalConv3DErrorCase7(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "not_valid"
+
+
+class TestFunctionalConv3DErrorCase8(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = [1, 2, 1, 2, 1]
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NCDHW"
+
+
+class TestFunctionalConv3DErrorCase9(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = -5
+        self.out_channels = 5
+        self.filter_shape = 3
+        self.padding = [[0, 0], [0, 0], [3, 2], [1, 2], [1, 1]]
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NCDHW"
+
+
+class TestFunctionalConv3DErrorCase10(TestFunctionalConv3DError):
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 4
+        self.filter_shape = 3
+        self.padding = "same"
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 2
+        self.no_bias = False
+        self.act = "sigmoid"
+        self.data_format = "NDHWC"
+
+
+class TestFunctionalConv3DErrorCase11(TestCase):
+    """Dygraph `F.conv3d` error case; this base config feeds empty arrays."""
+
+    def setUp(self):
+        self.input = np.array([])
+        self.filter = np.array([])
+        self.num_filters = 0
+        self.filter_size = 0
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.data_format = "NCDHW"
+
+    def dygraph_case(self):
+        with dg.guard():
+            inputs = paddle.to_tensor(self.input, dtype=paddle.float32)
+            kernel = paddle.to_tensor(self.filter, dtype=paddle.float32)
+            bias = None
+            if self.bias is not None:
+                bias = paddle.to_tensor(self.bias, dtype=paddle.float32)
+            y = F.conv3d(
+                inputs,
+                kernel,
+                bias,
+                padding=self.padding,
+                stride=self.stride,
+                dilation=self.dilation,
+                groups=self.groups,
+                data_format=self.data_format,
+            )
+
+    def test_dygraph_exception(self):
+        with self.assertRaises(ValueError):
+            self.dygraph_case()
+
+
+class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11):
+    def setUp(self):
+        self.input = np.random.randn(1, 3, 3, 3, 3)
+        self.filter = np.random.randn(3, 3, 1, 1, 1)
+        self.num_filters = 3
+        self.filter_size = 1
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 0
+        self.data_format = "NCDHW"
+
+
+class TestFunctionalConv3DErrorCase13(TestFunctionalConv3DErrorCase11):
+    def setUp(self):
+        self.input = np.random.randn(0, 0, 0, 0, 0)
+        self.filter = np.random.randn(1, 0, 0, 0, 0)
+        self.num_filters = 1
+        self.filter_size = 1
+        self.bias = None
+        self.padding = 0
+        self.stride = 1
+        self.dilation = 1
+        self.groups = 1
+        self.data_format = "NCDHW"
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/legacy_test/test_fused_elemwise_activation_op.py b/test/legacy_test/test_fused_elemwise_activation_op.py
index b14e86aba9ff8..301985fff8ff6 100644
--- a/test/legacy_test/test_fused_elemwise_activation_op.py
+++ b/test/legacy_test/test_fused_elemwise_activation_op.py
@@ -18,8 +18,6 @@
 import numpy as np
 from op_test import OpTest
 
-from paddle.base import core
-
 #   TestFusedElementwiseActivationOp
 #   TestFusedElementwiseActivationOp_scalar
 #   TestFusedElementwiseActivationOp_scalar2
@@ -32,6 +30,25 @@
 #   TestFusedElementwiseActivationOp_rowwise_add_0
 #   TestFusedElementwiseActivationOp_rowwise_add_1
 #   TestFusedElementwiseActivationOp_channelwise_add
+import paddle
+from paddle.base import core
+
+
+def api_wrapper(
+    x, y, functor_list=[], axis=-1, scale=0.0, save_intermediate_out=False
+):
+    """Call the legacy ``fused_elemwise_activation`` op with flat attr pairs."""
+    attr_pairs = (
+        "axis",
+        axis,
+        "scale",
+        scale,
+        "save_intermediate_out",
+        save_intermediate_out,
+        "functor_list",
+        functor_list,
+    )
+    return paddle._legacy_C_ops.fused_elemwise_activation(x, y, *attr_pairs)
 
 
 def create_test_class(
@@ -40,6 +57,8 @@ def create_test_class(
     class TestFusedElementwiseActivationOp_base(OpTest):
         def setUp(self):
             self.op_type = "fused_elemwise_activation"
+            self.python_api = api_wrapper
+            self.python_out_sig = ['Out']
             self.dtype = dtype
             self.axis = -1
 
diff --git a/test/legacy_test/test_fused_groupnorm.py b/test/legacy_test/test_fused_groupnorm.py
new file mode 100644
index 0000000000000..5dbaa4d5a569d
--- /dev/null
+++ b/test/legacy_test/test_fused_groupnorm.py
@@ -0,0 +1,321 @@
+#  Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle import base
+from paddle.base import core
+from paddle.base.layer_helper import LayerHelper
+
+
+def naive_residual_add(x, residual):
+    return np.add(x, residual)  # elementwise sum; NumPy broadcasting applies
+
+
+def naive_group_norm(x, scale, bias, epsilon, groups, data_layout):
+    """NumPy reference implementation of group normalization.
+
+    Args:
+        x: 3-D, 4-D or 5-D array. With ``data_layout == "NHWC"`` the channel
+            axis is the last one, otherwise it is axis 1.
+        scale: per-channel multiplier, reshaped to broadcast over axis 1.
+        bias: per-channel offset, reshaped to broadcast over axis 1.
+        epsilon: value added to the variance before the square root.
+        groups: number of channel groups; statistics are taken per
+            (sample, group) pair.
+        data_layout: ``"NHWC"`` selects channel-last; any other value is
+            treated as channel-first.
+
+    Returns:
+        ``[output, mean, var]`` where ``output`` has the shape and layout of
+        ``x`` and ``mean``/``var`` have shape ``(N, groups)``.
+
+    Raises:
+        ValueError: if ``x`` is not 3-D, 4-D or 5-D.
+    """
+    rank = x.ndim
+    if rank not in (3, 4, 5):
+        raise ValueError(
+            f"naive_group_norm expects 3-D, 4-D or 5-D input, got {rank}-D"
+        )
+    channel_last = data_layout == "NHWC"
+    if channel_last:
+        # Move channels to axis 1 (NLC/NHWC/NDHWC => NCL/NCHW/NCDHW).
+        x = np.moveaxis(x, -1, 1)
+    channel_first_shape = x.shape
+    N = channel_first_shape[0]
+    G = groups
+    # One row per (sample, group): statistics cover all of a group's elements.
+    x = x.reshape((N * G, -1))
+    mean = np.mean(x, axis=1, keepdims=True)
+    var = np.var(x, axis=1, keepdims=True)
+    output = (x - mean) / np.sqrt(var + epsilon)
+    # (C, 1, ..., 1) so scale and bias broadcast along the channel axis.
+    per_channel = (-1,) + (1,) * (rank - 2)
+    output = output.reshape(channel_first_shape) * scale.reshape(
+        per_channel
+    ) + bias.reshape(per_channel)
+    if channel_last:
+        # Restore the caller's channel-last layout.
+        output = np.moveaxis(output, 1, -1)
+    return [output, mean.reshape((N, G)), var.reshape((N, G))]
+
+
+def naive_residual_biasadd_layer_norm(
+    x, residual, scale, bias, epsilon, groups, data_layout, activation
+):
+    total = x + residual
+    normed = naive_group_norm(total, scale, bias, epsilon, groups, data_layout)
+    if activation == "silu":  # SiLU is applied to the normalized output only
+        normed[0] = F.silu(paddle.to_tensor(normed[0])).numpy()
+    return normed
+
+
+def add_group_norm_silu_static_wrapper(
+    x, residual, scale, bias, epsilon, groups, data_layout="NHWC", activation=""
+):
+    """Append one ``add_group_norm_silu`` op through ``LayerHelper``.
+
+    ``bias``, ``scale`` and ``residual`` are only wired in when not ``None``.
+    Returns the variables bound to the op's ``y`` and ``residual_out`` outputs.
+    """
+    # Must stay first so that locals() contains only the call arguments.
+    helper = LayerHelper('add_group_norm_silu', **locals())
+    make_var = helper.create_variable_for_type_inference
+    # Statistics outputs are created with stop_gradient=True.
+    mean_var = make_var(dtype=x.dtype, stop_gradient=True)
+    variance_var = make_var(dtype=x.dtype, stop_gradient=True)
+    op_inputs = {'x': x}
+    optional = (('bias', bias), ('scale', scale), ('residual', residual))
+    for slot, value in optional:
+        if value is not None:
+            op_inputs[slot] = value
+    normed_var = make_var(dtype=x.dtype)
+    residual_var = make_var(dtype=x.dtype)
+    op_outputs = {
+        "y": normed_var,
+        "residual_out": residual_var,
+        "mean": mean_var,
+        "variance": variance_var,
+    }
+    op_attrs = {
+        "epsilon": epsilon,
+        "groups": groups,
+        "data_format": data_layout,
+        "activation": activation,
+    }
+    helper.append_op(
+        type="add_group_norm_silu",
+        inputs=op_inputs,
+        outputs=op_outputs,
+        attrs=op_attrs,
+    )
+    return normed_var, residual_var
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support the float16",
+)
+class TestGroupNormNHWC_StaticOp(unittest.TestCase):
+    """Check the static add_group_norm_silu op against a NumPy reference."""
+
+    def setUp(self):
+        np.random.seed(20)
+        self.shape = (2, 4, 2, 6)
+        self.r_shape = (1, 1, 1, 6)
+        self.x_np = np.random.uniform(-0.05, 0.05, self.shape)
+        self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape)
+        self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.epsilon = 1e-5
+        self.groups = 2
+        self.data_layout = 'NHWC'
+        self.activation = ''
+        self.place = paddle.CUDAPlace(0)
+
+    def check_residual_add_groupnorm(
+        self, x_np, scale_np, bias_np, residual_np, activation, dtype
+    ):
+        """Run the NumPy reference and the static op on the same inputs.
+
+        Returns ``((y, residual_out), reference_group_norm, reference_sum)``.
+        """
+        # The reference may call F.silu(...).numpy(), so leave static mode.
+        paddle.disable_static()
+        naive_groupnorm_out = naive_residual_biasadd_layer_norm(
+            x_np,
+            residual_np,
+            scale_np,
+            bias_np,
+            self.epsilon,
+            self.groups,
+            self.data_layout,
+            activation,
+        )
+        naive_residual_out = naive_residual_add(x_np, residual_np)
+        paddle.enable_static()
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            x_static = paddle.static.data(
+                name="x_static", shape=self.shape, dtype=dtype
+            )
+            residual_static = paddle.static.data(
+                name="residual_static", shape=self.r_shape, dtype=dtype
+            )
+            scale_static = paddle.static.data(
+                name="scale_static", shape=[self.shape[-1]], dtype=dtype
+            )
+            bias_static = paddle.static.data(
+                name="bias_static", shape=[self.shape[-1]], dtype=dtype
+            )
+            outs = add_group_norm_silu_static_wrapper(
+                x_static,
+                residual_static,
+                scale_static,
+                bias_static,
+                self.epsilon,
+                self.groups,
+                self.data_layout,
+                activation,
+            )
+
+            exe = base.Executor(self.place)
+            out_s = exe.run(
+                feed={
+                    "x_static": x_np.astype(dtype),
+                    "scale_static": scale_np.astype(dtype),
+                    "residual_static": residual_np.astype(dtype),
+                    "bias_static": bias_np.astype(dtype),
+                },
+                fetch_list=[outs],
+            )
+        return (out_s[0], out_s[1]), naive_groupnorm_out, naive_residual_out
+
+    def test_residual_add_groupnorm_fp16(self):
+        self.dtype = np.float16
+        outs, ref_norm, ref_sum = self.check_residual_add_groupnorm(
+            self.x_np.astype(self.dtype),
+            self.scale_np.astype(self.dtype),
+            self.bias_np.astype(self.dtype),
+            self.residual_np.astype(self.dtype),
+            self.activation,
+            self.dtype,
+        )
+        # residual_out (outs[1]) must equal the plain x + residual sum.
+        np.testing.assert_allclose(
+            outs[1],
+            ref_sum,
+            rtol=1e-5,
+            atol=1e-5,
+        )
+        # y (outs[0]) is compared with the reference group-norm output.
+        np.testing.assert_allclose(
+            outs[0],
+            ref_norm[0],
+            rtol=1e-4,
+            atol=1e-4,
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support the float16",
+)
+class TestGroupNormNHWCSilu_StaticOp(TestGroupNormNHWC_StaticOp):
+    def setUp(self):
+        np.random.seed(20)
+        self.shape = (2, 4, 2, 6)
+        self.r_shape = (1, 1, 1, 6)  # broadcasts against self.shape
+        self.x_np = np.random.uniform(-0.05, 0.05, self.shape)
+        self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape)
+        self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.epsilon = 1e-5
+        self.groups = 2
+        self.data_layout = 'NHWC'
+        self.activation = 'silu'  # reference applies F.silu to its output
+        self.place = paddle.CUDAPlace(0)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support the float16",
+)
+class TestGroupNormNHWC_StaticOp_1(TestGroupNormNHWC_StaticOp):
+    def setUp(self):
+        np.random.seed(20)
+        self.shape = (2, 4, 2, 6)
+        self.r_shape = (2, 4, 2, 6)  # residual has the same shape as x
+        self.x_np = np.random.uniform(-0.05, 0.05, self.shape)
+        self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape)
+        self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.epsilon = 1e-5
+        self.groups = 2
+        self.data_layout = 'NHWC'
+        self.activation = ''  # no activation; the Silu variant covers 'silu'
+        self.place = paddle.CUDAPlace(0)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support the float16",
+)
+class TestGroupNormNHWCSilu_StaticOp_1(TestGroupNormNHWC_StaticOp):
+    def setUp(self):
+        np.random.seed(20)
+        self.shape = (2, 4, 2, 6)
+        self.r_shape = (2, 4, 2, 6)  # residual has the same shape as x
+        self.x_np = np.random.uniform(-0.05, 0.05, self.shape)
+        self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape)
+        self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.epsilon = 1e-5
+        self.groups = 2
+        self.data_layout = 'NHWC'
+        self.activation = 'silu'  # reference applies F.silu to its output
+        self.place = paddle.CUDAPlace(0)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support the float16",
+)
+class TestGroupNormNHWCSingleC_StaticOp(TestGroupNormNHWC_StaticOp):
+    def setUp(self):
+        np.random.seed(20)
+        self.shape = (2, 4, 2, 6)
+        self.r_shape = (2, 4, 2, 6)  # residual has the same shape as x
+        self.x_np = np.random.uniform(-0.05, 0.05, self.shape)
+        self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape)
+        self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]])
+        self.epsilon = 1e-5
+        self.groups = 6  # equals self.shape[-1]: one channel per group
+        self.data_layout = 'NHWC'
+        self.activation = ''
+        self.place = paddle.CUDAPlace(0)
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/test/deprecated/legacy_test/test_gammaln_op.py b/test/legacy_test/test_gammaln_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_gammaln_op.py
rename to test/legacy_test/test_gammaln_op.py
diff --git a/test/deprecated/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_gaussian_random_op.py
rename to test/legacy_test/test_gaussian_random_op.py
diff --git a/test/deprecated/legacy_test/test_graph_send_recv_op.py b/test/legacy_test/test_graph_send_recv_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_graph_send_recv_op.py
rename to test/legacy_test/test_graph_send_recv_op.py
diff --git a/test/deprecated/legacy_test/test_graph_send_ue_recv_op.py b/test/legacy_test/test_graph_send_ue_recv_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_graph_send_ue_recv_op.py
rename to test/legacy_test/test_graph_send_ue_recv_op.py
diff --git a/test/deprecated/legacy_test/test_graph_send_uv_op.py b/test/legacy_test/test_graph_send_uv_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_graph_send_uv_op.py
rename to test/legacy_test/test_graph_send_uv_op.py
diff --git a/test/deprecated/legacy_test/test_grid_sampler_op.py b/test/legacy_test/test_grid_sampler_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_grid_sampler_op.py
rename to test/legacy_test/test_grid_sampler_op.py
diff --git a/test/legacy_test/test_group_norm_op.py b/test/legacy_test/test_group_norm_op.py
index f097df3b0b99c..7a6f57cc61ece 100644
--- a/test/legacy_test/test_group_norm_op.py
+++ b/test/legacy_test/test_group_norm_op.py
@@ -209,7 +209,7 @@ def do_compare_between_place(self):
             gpu_grads,
             inputs_to_check,
             0.005,
-            "Gradient Check On %s" % str(place),
+            f"Gradient Check On {str(place)}",
         )
 
     def test_check_grad(self):
@@ -1748,7 +1748,7 @@ def test_jit_comp(self):
                 fwd_actual[i],
                 rtol=rtol,
                 atol=atol,
-                err_msg='%s jit fwd' % self.places[i],
+                err_msg=f'{self.places[i]} jit fwd',
             )
 
             # TODO: fix the diff between cpu and gpu grad is large in original op
@@ -1762,7 +1762,7 @@ def test_jit_comp(self):
                 rev_actual[i],
                 rtol=rtol,
                 atol=atol,
-                err_msg='%s jit rev' % self.places[i],
+                err_msg=f'{self.places[i]} jit rev',
             )
 
     def test_jit_comp_with_cinn(self):
@@ -1820,7 +1820,7 @@ def test_jit_comp_with_cinn(self):
                 fwd_actual[i],
                 rtol=rtol,  # mean of uniform distribution, scale for avoid random failed
                 atol=atol,
-                err_msg='%s jit_cinn fwd' % self.places[i],
+                err_msg=f'{self.places[i]} jit_cinn fwd',
             )
             # TODO: fix the diff between cpu and gpu grad is large in original op
             # now use larger threshold when testing cpu grads to bypass cpu grad test
@@ -1832,7 +1832,7 @@ def test_jit_comp_with_cinn(self):
                 rev_actual[i],
                 rtol=rtol,  # mean of uniform distribution, scale for avoid random failed
                 atol=atol,
-                err_msg='%s jit_cinn rev' % self.places[i],
+                err_msg=f'{self.places[i]} jit_cinn rev',
             )
             i += 1
 
diff --git a/test/deprecated/legacy_test/test_gru_op.py b/test/legacy_test/test_gru_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_gru_op.py
rename to test/legacy_test/test_gru_op.py
diff --git a/test/deprecated/legacy_test/test_gru_unit_op.py b/test/legacy_test/test_gru_unit_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_gru_unit_op.py
rename to test/legacy_test/test_gru_unit_op.py
diff --git a/test/deprecated/legacy_test/test_gumbel_softmax_op.py b/test/legacy_test/test_gumbel_softmax_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_gumbel_softmax_op.py
rename to test/legacy_test/test_gumbel_softmax_op.py
diff --git a/test/deprecated/legacy_test/test_hinge_loss_op.py b/test/legacy_test/test_hinge_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_hinge_loss_op.py
rename to test/legacy_test/test_hinge_loss_op.py
diff --git a/test/deprecated/legacy_test/test_hsigmoid_op.py b/test/legacy_test/test_hsigmoid_op.py
similarity index 89%
rename from test/deprecated/legacy_test/test_hsigmoid_op.py
rename to test/legacy_test/test_hsigmoid_op.py
index ad3ae81821cdf..f481fc2ebee2f 100644
--- a/test/deprecated/legacy_test/test_hsigmoid_op.py
+++ b/test/legacy_test/test_hsigmoid_op.py
@@ -20,7 +20,6 @@
 
 import paddle
 import paddle.nn.functional as F
-from paddle import base
 from paddle.pir_utils import test_with_pir_api
 
 paddle.enable_static()
@@ -283,91 +282,6 @@ def test_check_output(self):
         self.check_output(check_pir=True)
 
 
-class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
-    def hs_net_conf(self, is_sparse):
-        input_word = paddle.static.data(name="x", shape=[-1, 1], dtype='int64')
-        path_table = paddle.static.data(
-            name='path_table', shape=[-1, 3], dtype='int64'
-        )
-        path_code = paddle.static.data(
-            name='path_code', shape=[-1, 3], dtype='int64'
-        )
-        label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
-
-        data_list = [input_word, path_table, path_code, label]
-
-        emb = paddle.static.nn.embedding(
-            input=input_word,
-            is_sparse=is_sparse,
-            size=[3, 3],
-            param_attr=base.ParamAttr(
-                initializer=paddle.nn.initializer.Normal(std=1 / math.sqrt(3))
-            ),
-        )
-
-        loss = paddle.nn.HSigmoidLoss(
-            feature_size=emb.shape[1],
-            num_classes=3,
-            bias_attr=True,
-            is_custom=True,
-            is_sparse=is_sparse,
-        )
-
-        cost = loss(
-            input=emb,
-            label=label,
-            path_table=path_table,
-            path_code=path_code,
-        )
-
-        avg_cost = paddle.mean(cost)
-
-        return avg_cost, data_list
-
-    def training_test(self, is_sparse):
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            paddle.seed(1)
-            start_up = paddle.static.default_startup_program()
-            x = np.arange(6).reshape(6)
-            path_table = np.array([(1, 2, -1), (1, 2, -1)]).astype('int64')
-            path_code = np.array([(1, 0, -1), (0, 0, -1)]).astype('int64')
-            label = np.array([1, 4]).astype('int64')
-
-            loss, data_list = self.hs_net_conf(is_sparse)
-            optimizer = paddle.optimizer.SGD(learning_rate=1e-3)
-            optimizer.minimize(loss)
-
-            main_program = paddle.static.default_main_program()
-            place = base.CPUPlace()
-            feeder = base.DataFeeder(feed_list=data_list, place=place)
-            exe = paddle.static.Executor(place)
-
-            exe.run(start_up)
-            result = []
-            for i in range(10):
-                data = [
-                    (
-                        [[x[i % 2]]],
-                        [list(path_table[i % 2])],
-                        [list(path_code[i % 2])],
-                        [label[i % 2]],
-                    )
-                ]
-
-                loss_val = exe.run(
-                    main_program, feed=feeder.feed(data), fetch_list=[loss]
-                )
-                result.append(loss_val)
-        return result
-
-    def test_hs_grad_with_sparse(self):
-        dense_result = self.training_test(is_sparse=False)
-        sparse_result = self.training_test(is_sparse=True)
-        assert dense_result == sparse_result
-
-
 @skip_check_grad_ci(
     reason="[skip shape check] The huffman tree is structed separately. It will be complicated if use large shape."
 )
diff --git a/test/deprecated/legacy_test/test_huber_loss_op.py b/test/legacy_test/test_huber_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_huber_loss_op.py
rename to test/legacy_test/test_huber_loss_op.py
diff --git a/test/deprecated/legacy_test/test_identity_loss_op.py b/test/legacy_test/test_identity_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_identity_loss_op.py
rename to test/legacy_test/test_identity_loss_op.py
diff --git a/test/deprecated/legacy_test/test_im2sequence_op.py b/test/legacy_test/test_im2sequence_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_im2sequence_op.py
rename to test/legacy_test/test_im2sequence_op.py
diff --git a/test/legacy_test/test_imperative_deepcf.py b/test/legacy_test/test_imperative_deepcf.py
index 301ec4e0a468e..31e94078c7ca8 100644
--- a/test/legacy_test/test_imperative_deepcf.py
+++ b/test/legacy_test/test_imperative_deepcf.py
@@ -188,7 +188,7 @@ def get_data(self):
         )
 
     def load_data(self):
-        sys.stderr.write('loading from %s\n' % self.data_path)
+        sys.stderr.write(f'loading from {self.data_path}\n')
         likes = {}
         num_users = -1
         num_items = -1
@@ -299,7 +299,7 @@ def test_deefcf(self):
                         },
                         fetch_list=[loss],
                     )[0]
-                    sys.stderr.write('static loss %s\n' % static_loss)
+                    sys.stderr.write(f'static loss {static_loss}\n')
 
         with base.dygraph.guard():
             paddle.seed(seed)
diff --git a/test/deprecated/legacy_test/test_imperative_framework.py b/test/legacy_test/test_imperative_framework.py
similarity index 77%
rename from test/deprecated/legacy_test/test_imperative_framework.py
rename to test/legacy_test/test_imperative_framework.py
index 01f6d37eed4b1..b85eeb11df517 100644
--- a/test/deprecated/legacy_test/test_imperative_framework.py
+++ b/test/legacy_test/test_imperative_framework.py
@@ -15,7 +15,6 @@
 import unittest
 
 import numpy as np
-from test_imperative_base import new_program_scope
 
 import paddle
 from paddle import base
@@ -53,21 +52,13 @@ def forward(self, inputs):
 
 
 class TestDygraphFramework(unittest.TestCase):
-    def test_dygraph_backward(self):
-        with new_program_scope():
-            mlp = MLP(input_size=2)
-            var_inp = paddle.static.data("input", shape=[2, 2], dtype="float32")
-            out = mlp(var_inp)
-            try:
-                out.backward()
-                raise AssertionError(
-                    "backward should not be usable in static graph mode"
-                )
-            except AssertionError as e:
-                self.assertTrue(e is not None)
-
     def test_dygraph_to_string(self):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
         with base.dygraph.guard():
             var_inp = paddle.to_tensor(np_inp)
             print(str(var_inp))
+
+
+if __name__ == '__main__':
+    paddle.disable_static()  # static mode is switched off before the tests run
+    unittest.main()
diff --git a/test/legacy_test/test_imperative_hook_for_layer.py b/test/legacy_test/test_imperative_hook_for_layer.py
index e80a31d47805f..18335bfaf98f1 100644
--- a/test/legacy_test/test_imperative_hook_for_layer.py
+++ b/test/legacy_test/test_imperative_hook_for_layer.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 sys.path.append("../deprecated/legacy_test")
-from test_imperative_lod_tensor_to_selected_rows import SimpleNet
+from test_imperative_lod_tensor_to_selected_rows_deprecated import SimpleNet
 
 import paddle
 from paddle import base
diff --git a/test/legacy_test/test_imperative_layers.py b/test/legacy_test/test_imperative_layers.py
index 9906d3ba0ede0..947ab037ee89b 100644
--- a/test/legacy_test/test_imperative_layers.py
+++ b/test/legacy_test/test_imperative_layers.py
@@ -85,7 +85,9 @@ def test_layer_str(self):
         self.assertEqual(str(module), 'Tanhshrink()')
 
         module = nn.ThresholdedReLU()
-        self.assertEqual(str(module), 'ThresholdedReLU(threshold=1.0)')
+        self.assertEqual(
+            str(module), 'ThresholdedReLU(threshold=1.0, value=0.0)'
+        )
 
         module = nn.LogSigmoid()
         self.assertEqual(str(module), 'LogSigmoid()')
diff --git a/test/deprecated/legacy_test/test_imperative_star_gan_with_gradient_penalty.py b/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py
similarity index 100%
rename from test/deprecated/legacy_test/test_imperative_star_gan_with_gradient_penalty.py
rename to test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py
diff --git a/test/deprecated/legacy_test/test_index_add_op.py b/test/legacy_test/test_index_add_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_index_add_op.py
rename to test/legacy_test/test_index_add_op.py
diff --git a/test/deprecated/legacy_test/test_index_sample_op.py b/test/legacy_test/test_index_sample_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_index_sample_op.py
rename to test/legacy_test/test_index_sample_op.py
diff --git a/test/deprecated/legacy_test/test_index_select_op.py b/test/legacy_test/test_index_select_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_index_select_op.py
rename to test/legacy_test/test_index_select_op.py
diff --git a/test/legacy_test/test_inference_api.py b/test/legacy_test/test_inference_api.py
new file mode 100644
index 0000000000000..2bbf3ceb24431
--- /dev/null
+++ b/test/legacy_test/test_inference_api.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+
+paddle.enable_static()
+import numpy as np
+
+from paddle.base.core import PaddleDType, PaddleTensor
+
+
+class TestInferenceApi(unittest.TestCase):
+    """Round-trip checks between NumPy arrays and ``PaddleTensor``.
+
+    Each case checks the reported dtype, the list view of ``data`` and the
+    ndarray view obtained after ``data.reset``.
+    """
+
+    def _check_round_trip(self, array, expected_dtype, type_name):
+        """Wrap ``array`` in a ``PaddleTensor`` and verify dtype and contents.
+
+        Args:
+            array: NumPy array used to build the tensor and as the expectation.
+            expected_dtype: ``PaddleDType`` member the tensor should report.
+            type_name: dtype string handed to ``PaddleTensor.data.tolist``.
+        """
+        expected = array.ravel().tolist()
+        wrapped = PaddleTensor(array)
+        self.assertEqual(wrapped.dtype, expected_dtype)
+        self.assertEqual(wrapped.data.tolist(type_name), expected)
+        # After data.reset(array) the ndarray view must still match.
+        wrapped.data.reset(array)
+        self.assertEqual(wrapped.as_ndarray().ravel().tolist(), expected)
+
+    def test_inference_api(self):
+        self._check_round_trip(
+            np.random.randint(10, 20, size=[20, 2]).astype('int32'),
+            PaddleDType.INT32,
+            'int32',
+        )
+
+        self._check_round_trip(
+            np.random.randint(10, 20, size=[20, 2]).astype('int64'),
+            PaddleDType.INT64,
+            'int64',
+        )
+
+        self._check_round_trip(
+            np.random.randn(20, 2).astype('float32'),
+            PaddleDType.FLOAT32,
+            'float32',
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/test/legacy_test/test_inference_model_io.py b/test/legacy_test/test_inference_model_io.py
new file mode 100644
index 0000000000000..6b28a41bc4b08
--- /dev/null
+++ b/test/legacy_test/test_inference_model_io.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle.base import core, executor
+from paddle.distributed.io import (
+    load_inference_model_distributed,
+)
+from paddle.pir_utils import test_with_pir_api
+from paddle.static.io import load_inference_model
+
+paddle.enable_static()
+
+
+class TestLoadInferenceModelError(unittest.TestCase):
+    """Loaders given a path that does not exist should raise ValueError."""
+
+    @test_with_pir_api
+    def test_load_model_not_exist(self):
+        cpu_place = core.CPUPlace()
+        exe = executor.Executor(cpu_place)
+
+        with self.assertRaises(ValueError):
+            load_inference_model('./test_not_exist_dir/model', exe)
+
+        # Same expectation for the distributed loader.
+        with self.assertRaises(ValueError):
+            load_inference_model_distributed('./test_not_exist_dir', exe)
+
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/test/deprecated/legacy_test/test_input_spec.py b/test/legacy_test/test_input_spec.py
similarity index 96%
rename from test/deprecated/legacy_test/test_input_spec.py
rename to test/legacy_test/test_input_spec.py
index 8f86d002da306..aa649b58ca2a8 100644
--- a/test/deprecated/legacy_test/test_input_spec.py
+++ b/test/legacy_test/test_input_spec.py
@@ -35,9 +35,17 @@ def test_default(self):
         self.assertIsNone(tensor_spec.name)
 
     def test_from_tensor(self):
-        x_bool = paddle.tensor.fill_constant(
-            shape=[1], dtype='bool', value=True
-        )
+        if paddle.framework.use_pir_api():
+            x_bool = paddle.pir.core.create_parameter(
+                dtype='float32',
+                shape=[1],
+                name='xx',
+                initializer=paddle.nn.initializer.Uniform(),
+            )
+        else:
+            x_bool = paddle.tensor.fill_constant(
+                shape=[1], dtype='bool', value=True
+            )
         bool_spec = InputSpec.from_tensor(x_bool)
         self.assertEqual(bool_spec.dtype, x_bool.dtype)
         self.assertEqual(list(bool_spec.shape), list(x_bool.shape))
diff --git a/test/deprecated/legacy_test/test_instance_norm_op_v2.py b/test/legacy_test/test_instance_norm_op_v2.py
similarity index 100%
rename from test/deprecated/legacy_test/test_instance_norm_op_v2.py
rename to test/legacy_test/test_instance_norm_op_v2.py
diff --git a/test/deprecated/legacy_test/test_is_integer.py b/test/legacy_test/test_is_integer.py
similarity index 100%
rename from test/deprecated/legacy_test/test_is_integer.py
rename to test/legacy_test/test_is_integer.py
diff --git a/test/legacy_test/test_isin.py b/test/legacy_test/test_isin.py
new file mode 100644
index 0000000000000..101d89b4de84f
--- /dev/null
+++ b/test/legacy_test/test_isin.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import convert_float_to_uint16
+
+import paddle
+from paddle import base
+from paddle.base import core
+from paddle.pir_utils import test_with_pir_api
+
+DATA_CASES = [  # general cases: one 0-d pair plus random integer arrays
+    {'x_data': np.array(1.0), 'test_x_data': np.array(-1.0)},
+    {
+        'x_data': np.random.randint(-10, 10, (4, 8)),
+        'test_x_data': np.random.randint(0, 20, (2, 3)),
+    },
+    {
+        'x_data': np.random.randint(-50, 50, (8, 64)),
+        'test_x_data': np.random.randint(-20, 0, (4, 256)),
+    },
+]
+
+DATA_CASES_UNIQUE = [  # np.arange data; run with assume_unique=True
+    {
+        'x_data': np.arange(0, 1000).reshape([2, 5, 100]),
+        'test_x_data': np.arange(200, 700),
+    },
+    {
+        'x_data': np.arange(-100, 100).reshape([2, 2, 5, 10]),
+        'test_x_data': np.arange(50, 150).reshape([4, 5, 5]),
+    },
+]
+
+DATA_CASES_BF16 = [  # non-negative values only; used by the bf16 tests
+    {'x_data': np.array(1.0), 'test_x_data': np.array(0.0)},
+    {
+        'x_data': np.random.randint(0, 10, (4, 8)),
+        'test_x_data': np.random.randint(5, 15, (2, 3)),
+    },
+    {
+        'x_data': np.random.randint(0, 50, (8, 64)),
+        'test_x_data': np.random.randint(0, 20, (4, 256)),
+    },
+]
+
+
+DATA_CASES_UNIQUE_BF16 = [  # np.arange data for bf16 with assume_unique=True
+    {
+        'x_data': np.arange(0, 100).reshape([2, 5, 10]),
+        'test_x_data': np.arange(50, 150),
+    },
+]
+
+
+DATA_TYPE = ['float32', 'float64', 'int32', 'int64']  # dtypes for TestIsIn
+
+
+def run_dygraph(
+    x_data,
+    test_x_data,
+    type,
+    assume_unique=False,
+    invert=False,
+    use_gpu=False,
+):
+    """Run ``paddle.isin`` in dygraph mode on inputs cast to ``type``."""
+    # Use CUDA device 0 only when requested and compiled in; else the CPU.
+    want_cuda = use_gpu and base.core.is_compiled_with_cuda()
+    place = paddle.CUDAPlace(0) if want_cuda else paddle.CPUPlace()
+    paddle.disable_static(place)
+    # Cast first so the tensors carry the dtype under test.
+    elements = paddle.to_tensor(x_data.astype(type))
+    candidates = paddle.to_tensor(test_x_data.astype(type))
+    return paddle.isin(elements, candidates, assume_unique, invert)
+
+
+def run_static(
+    x_data,
+    test_x_data,
+    type,
+    assume_unique=False,
+    invert=False,
+    use_gpu=False,
+):
+    """Build and run a static program for ``paddle.isin``; return fetches."""
+    paddle.enable_static()
+    startup_program = paddle.static.Program()
+    main_program = paddle.static.Program()
+    want_cuda = use_gpu and base.core.is_compiled_with_cuda()
+    place = paddle.CUDAPlace(0) if want_cuda else paddle.CPUPlace()
+    exe = base.Executor(place)
+    with paddle.static.program_guard(main_program, startup_program):
+        feed = {
+            'x_e': x_data.astype(type),
+            'x_t': test_x_data.astype(type),
+        }
+        x_e = paddle.static.data(
+            name='x_e', shape=feed['x_e'].shape, dtype=type
+        )
+        x_t = paddle.static.data(
+            name='x_t', shape=feed['x_t'].shape, dtype=type
+        )
+        out = paddle.isin(x_e, x_t, assume_unique, invert)
+        return exe.run(feed=feed, fetch_list=[out])
+
+
+def test(
+    data_cases, type_cases, assume_unique=False, invert=False, use_gpu=False
+):
+    """Check ``paddle.isin`` against ``np.isin`` in dygraph and static mode."""
+    for dtype in type_cases:
+        for case in data_cases:
+            x_data = case['x_data']
+            test_x_data = case['test_x_data']
+            # NumPy provides the expected result for both execution modes.
+            np_result = np.isin(
+                x_data.astype(dtype),
+                test_x_data.astype(dtype),
+                assume_unique=assume_unique,
+                invert=invert,
+            )
+            # Dygraph result is compared first.
+            dygraph_out = run_dygraph(
+                x_data, test_x_data, dtype, assume_unique, invert, use_gpu
+            )
+            np.testing.assert_equal(dygraph_out.numpy(), np_result)
+
+            # The static check is a closure over this iteration's inputs
+            # and expected result, decorated with test_with_pir_api.
+            @test_with_pir_api
+            def test_static():
+                (static_result,) = run_static(
+                    x_data,
+                    test_x_data,
+                    dtype,
+                    assume_unique,
+                    invert,
+                    use_gpu,
+                )
+                np.testing.assert_equal(static_result, np_result)
+
+            test_static()
+
+
+def run_dygraph_bf16(
+    x_data,
+    test_x_data,
+    assume_unique=False,
+    invert=False,
+    use_gpu=False,
+):
+    """Run dygraph ``paddle.isin`` on ``convert_float_to_uint16`` outputs."""
+    want_cuda = use_gpu and base.core.is_compiled_with_cuda()
+    place = paddle.CUDAPlace(0) if want_cuda else paddle.CPUPlace()
+    paddle.disable_static(place)
+    elements = paddle.to_tensor(convert_float_to_uint16(x_data))
+    candidates = paddle.to_tensor(convert_float_to_uint16(test_x_data))
+    return paddle.isin(elements, candidates, assume_unique, invert)
+
+
+def run_static_bf16(
+    x_data,
+    test_x_data,
+    assume_unique=False,
+    invert=False,
+    use_gpu=False,
+):
+    """Build and run a static ``paddle.isin`` program on uint16 inputs."""
+    paddle.enable_static()
+    startup_program = paddle.static.Program()
+    main_program = paddle.static.Program()
+    want_cuda = use_gpu and base.core.is_compiled_with_cuda()
+    place = paddle.CUDAPlace(0) if want_cuda else paddle.CPUPlace()
+    exe = base.Executor(place)
+    with paddle.static.program_guard(main_program, startup_program):
+        # Feed the arrays returned by convert_float_to_uint16; the two
+        # placeholders are declared with the matching np.uint16 dtype.
+        feed = {
+            'x_e': convert_float_to_uint16(x_data),
+            'x_t': convert_float_to_uint16(test_x_data),
+        }
+        x_e = paddle.static.data(
+            name='x_e', shape=feed['x_e'].shape, dtype=np.uint16
+        )
+        x_t = paddle.static.data(
+            name='x_t', shape=feed['x_t'].shape, dtype=np.uint16
+        )
+        out = paddle.isin(x_e, x_t, assume_unique, invert)
+        return exe.run(feed=feed, fetch_list=[out])
+
+
+def test_bf16(data_cases, assume_unique=False, invert=False, use_gpu=False):
+    """Check bf16 ``paddle.isin`` against ``np.isin`` on float32 copies."""
+    for case in data_cases:
+        x_data = case['x_data'].astype("float32")
+        test_x_data = case['test_x_data'].astype("float32")
+        # NumPy provides the expected result from the float32 arrays.
+        np_result = np.isin(
+            x_data,
+            test_x_data,
+            assume_unique=assume_unique,
+            invert=invert,
+        )
+        dygraph_out = run_dygraph_bf16(
+            x_data, test_x_data, assume_unique, invert, use_gpu
+        )
+        np.testing.assert_equal(dygraph_out.numpy(), np_result)
+
+        # The static check is a closure over this iteration's inputs and
+        # expected result, decorated with test_with_pir_api.
+        @test_with_pir_api
+        def test_static():
+            (static_result,) = run_static_bf16(
+                x_data,
+                test_x_data,
+                assume_unique,
+                invert,
+                use_gpu,
+            )
+            np.testing.assert_equal(static_result, np_result)
+
+        test_static()
+
+
+class TestIsInError(unittest.TestCase):
+    def test_for_exception(self):
+        x, test_x = np.array([1, 2]), np.array([1, 2])  # ndarrays, not Tensors
+        self.assertRaises(TypeError, paddle.isin, x, test_x)
+
+
+class TestIsIn(unittest.TestCase):
+    # Every case runs over DATA_TYPE; only the flags and device vary.
+    def _check(self, cases, **flags):
+        test(cases, DATA_TYPE, **flags)
+
+    def test_without_gpu(self):
+        self._check(DATA_CASES)
+
+    def test_with_gpu(self):
+        self._check(DATA_CASES, use_gpu=True)
+
+    def test_invert_without_gpu(self):
+        self._check(DATA_CASES, invert=True)
+
+    def test_invert_with_gpu(self):
+        self._check(DATA_CASES, invert=True, use_gpu=True)
+
+    def test_unique_without_gpu(self):
+        self._check(DATA_CASES_UNIQUE, assume_unique=True)
+
+    def test_unique_with_gpu(self):
+        self._check(DATA_CASES_UNIQUE, assume_unique=True, use_gpu=True)
+
+    def test_unique_invert_without_gpu(self):
+        self._check(DATA_CASES_UNIQUE, assume_unique=True, invert=True)
+
+    def test_unique_invert_with_gpu(self):
+        self._check(
+            DATA_CASES_UNIQUE, assume_unique=True, invert=True, use_gpu=True
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and not support the float16",
+)
+class TestIsInFP16(unittest.TestCase):
+    # float16 is exercised on the GPU only (the class is skipped unless CUDA
+    # with float16 support is available), so the helper pins use_gpu=True
+    # for every case.
+    def _check(self, cases, **flags):
+        test(cases, ['float16'], use_gpu=True, **flags)
+
+    def test_default(self):
+        self._check(DATA_CASES)
+
+    def test_invert(self):
+        self._check(DATA_CASES, invert=True)
+
+    def test_unique(self):
+        self._check(DATA_CASES_UNIQUE, assume_unique=True)
+
+    def test_unique_invert(self):
+        self._check(DATA_CASES_UNIQUE, assume_unique=True, invert=True)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestIsInBF16(unittest.TestCase):
+    # The skip condition checks bfloat16 (not float16) support because these
+    # cases run the bf16 helpers; the helper pins use_gpu=True for each case.
+    def _check(self, cases, **flags):
+        test_bf16(cases, use_gpu=True, **flags)
+
+    def test_default(self):
+        self._check(DATA_CASES_BF16)
+
+    def test_invert(self):
+        self._check(DATA_CASES_BF16, invert=True)
+
+    def test_unique(self):
+        self._check(DATA_CASES_UNIQUE_BF16, assume_unique=True)
+
+    def test_unique_invert(self):
+        self._check(DATA_CASES_UNIQUE_BF16, assume_unique=True, invert=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/legacy_test/test_jit_save_load.py b/test/legacy_test/test_jit_save_load.py
index 09f5a7b9a4e4b..04b86c6864685 100644
--- a/test/legacy_test/test_jit_save_load.py
+++ b/test/legacy_test/test_jit_save_load.py
@@ -329,7 +329,6 @@ def train(layer, input_size=784, label_size=1):
     for data in train_loader():
         img, label = data
         label.stop_gradient = True
-
         cost = layer(img)
 
         loss = paddle.nn.functional.cross_entropy(
@@ -396,6 +395,8 @@ def train_and_save_model(self, model_path=None):
     @test_with_dygraph_pir
     def test_save_load(self):
         # train and save model
+        if not paddle.framework.use_pir_api():
+            return
         train_layer = self.train_and_save_model()
         # load model
         loaded_layer = paddle.jit.load(self.model_path)
@@ -496,6 +497,7 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()
 
+    @test_with_dygraph_pir
     def test_output_same_order(self):
         x = paddle.to_tensor(np.random.random((4, 8)).astype('float32'))
 
@@ -1712,6 +1714,7 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()
 
+    @test_with_dygraph_pir
     def test_save_load_finetune_load(self):
         model_path = os.path.join(
             self.temp_dir.name, "test_jit_save_load_save_without_running/model"
@@ -1788,7 +1791,6 @@ def forward(self, x):
         return y
 
 
-'''
 class TestJitSaveLoadFinetuneLoad(unittest.TestCase):
     def setUp(self):
         # enable dygraph mode
@@ -1798,8 +1800,10 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()
 
-    #@test_with_dygraph_pir
+    @test_with_dygraph_pir
     def test_save_load_finetune_load(self):
+        if not paddle.framework.use_pir_api():
+            return
         model_path = os.path.join(
             self.temp_dir.name, "test_jit_save_load_finetune_load/model"
         )
@@ -1830,7 +1834,6 @@ def test_save_load_finetune_load(self):
 
         self.assertTrue(float((result_00 - result_10).abs().max()) < 1e-5)
         self.assertTrue(float((result_01 - result_11).abs().max()) < 1e-5)
-'''
 
 
 # NOTE(weixin): When there are multiple test functions in an
diff --git a/test/deprecated/legacy_test/test_kldiv_loss_op.py b/test/legacy_test/test_kldiv_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_kldiv_loss_op.py
rename to test/legacy_test/test_kldiv_loss_op.py
diff --git a/test/deprecated/legacy_test/test_kron_op.py b/test/legacy_test/test_kron_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_kron_op.py
rename to test/legacy_test/test_kron_op.py
diff --git a/test/deprecated/legacy_test/test_kthvalue_op.py b/test/legacy_test/test_kthvalue_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_kthvalue_op.py
rename to test/legacy_test/test_kthvalue_op.py
diff --git a/test/deprecated/legacy_test/test_l1_norm_op.py b/test/legacy_test/test_l1_norm_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_l1_norm_op.py
rename to test/legacy_test/test_l1_norm_op.py
diff --git a/test/deprecated/legacy_test/test_label_smooth_op.py b/test/legacy_test/test_label_smooth_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_label_smooth_op.py
rename to test/legacy_test/test_label_smooth_op.py
diff --git a/test/deprecated/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py
similarity index 73%
rename from test/deprecated/legacy_test/test_layer_norm_op.py
rename to test/legacy_test/test_layer_norm_op.py
index 29e129781bfe0..2fd1eb2b1a747 100644
--- a/test/deprecated/legacy_test/test_layer_norm_op.py
+++ b/test/legacy_test/test_layer_norm_op.py
@@ -21,7 +21,6 @@
 
 import paddle
 import paddle.nn.functional as F
-from paddle import base
 from paddle.base import Program, core, program_guard
 from paddle.static.amp.fp16_utils import _keep_layer_norm_scale_bias_to_fp32
 
@@ -580,268 +579,6 @@ def initConfig(self):
         self.check_pir = True
 
 
-class TestLayerNormOp(unittest.TestCase):
-    def setUp(self):
-        self.use_cudnn = True
-        paddle.enable_static()
-
-    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        np.testing.assert_allclose(
-            np.array(tensor).flatten(),
-            np_array.flatten(),
-            rtol=1e-3,
-            atol=atol,
-            err_msg=msg,
-        )
-
-    def check_forward_backward(
-        self,
-        shape,
-        begin_norm_axis,
-        has_scale=True,
-        has_bias=True,
-        y_grad_scale=1.0,
-        use_mkldnn=False,
-    ):
-        def test_with_place(
-            place, shape, begin_norm_axis, use_mkldnn=use_mkldnn
-        ):
-            # attr
-            epsilon = 0.00001
-            x_shape = shape
-            D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
-            scale_shape = [D]
-
-            np.random.seed(123)
-            x = np.random.random_sample(x_shape).astype(np.float32)
-            scale = (
-                np.random.random_sample(scale_shape).astype(np.float32)
-                if has_scale
-                else None
-            )
-            bias = (
-                np.random.random_sample(scale_shape).astype(np.float32)
-                if has_bias
-                else None
-            )
-            y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
-                np.float32
-            )
-
-            # reference forward & backward
-            y, mean, variance = _reference_layer_norm_naive(
-                x, scale, bias, epsilon, begin_norm_axis
-            )
-            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
-                x, y_grad, scale, bias, mean, variance, begin_norm_axis
-            )
-
-            var_dict = locals()
-            var_dict['y@GRAD'] = y_grad
-            var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD']
-            if has_scale:
-                var_names += ['scale']
-            if has_bias:
-                var_names += ['bias']
-            ground_truth = {name: var_dict[name] for name in var_names}
-
-            program = base.Program()
-            with base.program_guard(program):
-                block = program.global_block()
-                for name in ground_truth:
-                    block.create_var(
-                        name=name,
-                        dtype='float32',
-                        shape=ground_truth[name].shape,
-                    )
-                inputs = {"X": block.var('x')}
-                fetch_list = [
-                    'y',
-                    'mean',
-                    'variance',
-                    'x@GRAD',
-                ]
-                if has_scale:
-                    inputs["Scale"] = block.var('scale')
-                    fetch_list += ['scale@GRAD']
-                if has_bias:
-                    inputs["Bias"] = block.var('bias')
-                    fetch_list += ['bias@GRAD']
-                layer_norm_op = block.append_op(
-                    type="layer_norm",
-                    inputs=inputs,
-                    outputs={
-                        "Y": block.var('y'),
-                        "Mean": block.var('mean'),  # share the same memory
-                        "Variance": block.var(
-                            'variance'
-                        ),  # share the same memory
-                    },
-                    attrs={
-                        "epsilon": epsilon,
-                        "begin_norm_axis": begin_norm_axis,
-                        "use_mkldnn": use_mkldnn,
-                    },
-                )
-                # generate backward op_desc
-                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    layer_norm_op.desc, set(), []
-                )
-                grad_op_desc = grad_op_desc_list[0]
-                new_op_desc = block.desc.append_op()
-                new_op_desc.copy_from(grad_op_desc)
-                for var_name in grad_op_desc.output_arg_names():
-                    block.desc.var(var_name.encode("ascii"))
-                grad_op_desc.infer_var_type(block.desc)
-                grad_op_desc.infer_shape(block.desc)
-                for arg in grad_op_desc.output_arg_names():
-                    grad_var = block.desc.find_var(arg.encode("ascii"))
-                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-
-                program._sync_with_cpp()
-                exe = base.Executor(place)
-                name_list = ['x', 'y@GRAD']
-                if has_scale:
-                    name_list += ['scale']
-                if has_bias:
-                    name_list += ['bias']
-
-                out = exe.run(
-                    program,
-                    feed={name: var_dict[name] for name in name_list},
-                    fetch_list=fetch_list,
-                )
-                # print(y)
-                # print(out[0])
-                self.__assert_close(y, out[0], "y")
-                self.__assert_close(mean, out[1], "mean")
-                self.__assert_close(variance, out[2], "variance", 1e-3)
-                self.__assert_close(x_grad, out[3], "x_grad")
-                if has_scale:
-                    self.__assert_close(
-                        scale_grad,
-                        out[fetch_list.index('scale@GRAD')],
-                        "scale_grad",
-                        1e-3,
-                    )
-                if has_bias:
-                    self.__assert_close(
-                        bias_grad,
-                        out[fetch_list.index('bias@GRAD')],
-                        "bias_grad",
-                    )
-
-        places = [core.CPUPlace()]
-        if (
-            core.is_compiled_with_cuda()
-            and core.op_support_gpu("layer_norm")
-            and self.use_cudnn
-        ):
-            places.append(core.CUDAPlace(0))
-
-        for place in places:
-            test_with_place(place, shape, begin_norm_axis)
-
-    def test_check_forward_backward_with_scale_and_bias(self):
-        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
-        self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1)
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=True,
-        )
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=True,
-            has_bias=False,
-        )
-        self.check_forward_backward(
-            shape=[2, 3, 4, 5],
-            begin_norm_axis=1,
-            has_scale=False,
-            has_bias=False,
-        )
-        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
-        self.check_forward_backward(
-            shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1
-        )
-        self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2)
-        self.check_forward_backward(shape=[3, 2, 1133], begin_norm_axis=2)
-        self.check_forward_backward(
-            shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1
-        )
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=False,
-            has_bias=True,
-            y_grad_scale=0.1,
-        )
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=True,
-            has_bias=False,
-            y_grad_scale=0.1,
-        )
-        self.check_forward_backward(
-            shape=[92, 513, 1134],
-            begin_norm_axis=2,
-            has_scale=False,
-            has_bias=False,
-            y_grad_scale=0.1,
-        )
-        self.check_forward_backward(
-            shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True
-        )
-        self.check_forward_backward(
-            shape=[1, 128, 256, 256],
-            begin_norm_axis=3,
-            has_scale=True,
-            has_bias=True,
-        )
-        self.check_forward_backward(
-            shape=[1, 256, 384],
-            begin_norm_axis=2,
-            has_scale=True,
-            has_bias=True,
-        )
-
-
-class TestLayerNormAPI(unittest.TestCase):
-    def test_case(self):
-        x = paddle.static.data(name='x', shape=[64, 32, 256], dtype='float32')
-        x = paddle.static.nn.layer_norm(
-            x,
-            scale=True,
-            shift=True,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr=None,
-            bias_attr=None,
-        )
-        x = paddle.static.nn.layer_norm(
-            x,
-            scale=False,
-            shift=False,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr=None,
-            bias_attr=None,
-        )
-        x = paddle.static.nn.layer_norm(
-            x,
-            scale=True,
-            shift=True,
-            begin_norm_axis=1,
-            epsilon=1e-05,
-            param_attr="scale",
-            bias_attr="shift",
-        )
-
-
 class TestDygraphLayerNormAPIError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/test/deprecated/legacy_test/test_lerp_op.py b/test/legacy_test/test_lerp_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_lerp_op.py
rename to test/legacy_test/test_lerp_op.py
diff --git a/test/deprecated/legacy_test/test_lgamma_op.py b/test/legacy_test/test_lgamma_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_lgamma_op.py
rename to test/legacy_test/test_lgamma_op.py
diff --git a/test/deprecated/legacy_test/test_linear_interp_op.py b/test/legacy_test/test_linear_interp_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_linear_interp_op.py
rename to test/legacy_test/test_linear_interp_op.py
diff --git a/test/deprecated/legacy_test/test_linear_interp_v2_op.py b/test/legacy_test/test_linear_interp_v2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_linear_interp_v2_op.py
rename to test/legacy_test/test_linear_interp_v2_op.py
diff --git a/test/legacy_test/test_listen_and_serv.sh b/test/legacy_test/test_listen_and_serv.sh
index d9d64e4dfa693..62cf4c359f0b1 100644
--- a/test/legacy_test/test_listen_and_serv.sh
+++ b/test/legacy_test/test_listen_and_serv.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,7 +22,7 @@ pid=$!
 flag1=test_handle_signal_in_serv_op.flag
 flag2=test_list_and_serv_run_empty_optimize_block.flag
 
-for i in {1..10}; do 
+for i in {1..10}; do
     sleep 6s
     if [[ -f "${flag1}" && -f "${flag2}" ]];  then
         echo "test_listen_and_serv_op exit"
@@ -34,8 +34,8 @@ echo "test_listen_and_serv_op.log context"
 cat test_listen_and_serv_op.log
 
 #display system context
-for i in {1..4}; do 
-    sleep 2 
+for i in {1..4}; do
+    sleep 2
     top -b -n1  | head -n 50
     echo "${i}"
     top -b -n1 -i  | head -n 50
@@ -54,8 +54,8 @@ kill -9 $pid
 echo "after kill ${pid}"
 
 #display system context
-for i in {1..4}; do 
-    sleep 2 
+for i in {1..4}; do
+    sleep 2
     top -b -n1  | head -n 50
     top -b -n1 -i  | head -n 50
     nvidia-smi
diff --git a/test/deprecated/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py
similarity index 100%
rename from test/deprecated/legacy_test/test_load_state_dict_from_old_format.py
rename to test/legacy_test/test_load_state_dict_from_old_format.py
diff --git a/test/deprecated/legacy_test/test_log_loss_op.py b/test/legacy_test/test_log_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_log_loss_op.py
rename to test/legacy_test/test_log_loss_op.py
diff --git a/test/deprecated/legacy_test/test_log_softmax.py b/test/legacy_test/test_log_softmax.py
similarity index 100%
rename from test/deprecated/legacy_test/test_log_softmax.py
rename to test/legacy_test/test_log_softmax.py
diff --git a/test/deprecated/legacy_test/test_logsumexp.py b/test/legacy_test/test_logsumexp.py
similarity index 100%
rename from test/deprecated/legacy_test/test_logsumexp.py
rename to test/legacy_test/test_logsumexp.py
diff --git a/test/deprecated/legacy_test/test_lr_scheduler.py b/test/legacy_test/test_lr_scheduler.py
similarity index 100%
rename from test/deprecated/legacy_test/test_lr_scheduler.py
rename to test/legacy_test/test_lr_scheduler.py
diff --git a/test/deprecated/legacy_test/test_lrn_op.py b/test/legacy_test/test_lrn_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_lrn_op.py
rename to test/legacy_test/test_lrn_op.py
diff --git a/test/legacy_test/test_lstm_cudnn_op.py b/test/legacy_test/test_lstm_cudnn_op.py
index ade1f61c0d5a9..3362297747b63 100644
--- a/test/legacy_test/test_lstm_cudnn_op.py
+++ b/test/legacy_test/test_lstm_cudnn_op.py
@@ -35,7 +35,7 @@ class RandomWeight:
     def __init__(self):
         pass
 
-    def updata_weight(self, hidden_size, input_size, dtype):
+    def update_weight(self, hidden_size, input_size, dtype):
         std = 1.0 / math.sqrt(hidden_size)
         self.hidden_size = hidden_size
         self.input_size = input_size
@@ -432,7 +432,7 @@ def setUp(self):
         input[9][3:][:] = 0
         input[8][4:][:] = 0
 
-        weight.updata_weight(hidden_size, input_size, self.dtype)
+        weight.update_weight(hidden_size, input_size, self.dtype)
         rnn1 = LSTM(
             input_size,
             hidden_size,
diff --git a/test/deprecated/legacy_test/test_lstm_op.py b/test/legacy_test/test_lstm_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_lstm_op.py
rename to test/legacy_test/test_lstm_op.py
diff --git a/test/deprecated/legacy_test/test_lu_unpack_op.py b/test/legacy_test/test_lu_unpack_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_lu_unpack_op.py
rename to test/legacy_test/test_lu_unpack_op.py
diff --git a/test/deprecated/legacy_test/test_masked_scatter.py b/test/legacy_test/test_masked_scatter.py
similarity index 100%
rename from test/deprecated/legacy_test/test_masked_scatter.py
rename to test/legacy_test/test_masked_scatter.py
diff --git a/test/deprecated/legacy_test/test_matmul_op.py b/test/legacy_test/test_matmul_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_matmul_op.py
rename to test/legacy_test/test_matmul_op.py
diff --git a/test/deprecated/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_matmul_v2_op.py
rename to test/legacy_test/test_matmul_v2_op.py
diff --git a/test/deprecated/legacy_test/test_maxout_op.py b/test/legacy_test/test_maxout_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_maxout_op.py
rename to test/legacy_test/test_maxout_op.py
diff --git a/test/deprecated/legacy_test/test_meshgrid_op.py b/test/legacy_test/test_meshgrid_op.py
similarity index 80%
rename from test/deprecated/legacy_test/test_meshgrid_op.py
rename to test/legacy_test/test_meshgrid_op.py
index b72f51cd04144..869e2c4e88281 100644
--- a/test/deprecated/legacy_test/test_meshgrid_op.py
+++ b/test/legacy_test/test_meshgrid_op.py
@@ -42,16 +42,28 @@ def init_data_type(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output(check_prim=True, check_pir=True, check_prim_pir=True)
+        if self.dtype in (np.complex64, np.complex128):
+            self.check_output(check_pir=True)  # no prim checks for complex
+        else:
+            self.check_output(
+                check_prim=True, check_pir=True, check_prim_pir=True
+            )
 
     def test_check_grad(self):
-        self.check_grad(
-            ['x0'],
-            ['out0', 'out1'],
-            check_prim=True,
-            check_pir=True,
-            check_prim_pir=True,
-        )
+        if self.dtype in (np.complex64, np.complex128):
+            # Complex dtypes: gradient check with only the PIR flag set,
+            # leaving check_prim / check_prim_pir at their defaults.
+            self.check_grad(['x0'], ['out0', 'out1'], check_pir=True)
+            return
+        # All other dtypes additionally enable the prim and prim+PIR
+        # gradient checks.
+        self.check_grad(
+            ['x0'],
+            ['out0', 'out1'],
+            check_prim=True,
+            check_pir=True,
+            check_prim_pir=True,
+        )
 
     def init_inputs_and_outputs(self):
         self.shape = self.get_x_shape()
@@ -91,6 +103,22 @@ def init_data_type(self):
         self.dtype = np.float16
 
 
+class TestMeshgridOp2Complex64(TestMeshgridOp):
+    def get_x_shape(self):
+        return [100, 300]
+
+    def init_data_type(self):
+        self.dtype = np.complex64  # takes the complex branch of the checks
+
+
+class TestMeshgridOp2Complex128(TestMeshgridOp):
+    def get_x_shape(self):
+        return [100, 300]
+
+    def init_data_type(self):
+        self.dtype = np.complex128  # takes the complex branch of the checks
+
+
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
@@ -336,6 +364,70 @@ def test_api_with_dygraph_tuple_input(self):
             np.testing.assert_array_equal(res_4.shape, [100, 200])
 
 
+class TestMeshgridOpComplexStatic(unittest.TestCase):
+    @test_with_pir_api
+    def test_tuple_input(self):
+        input_1 = np.random.randint(
+            0,
+            100,
+            [
+                100,
+            ],
+        ).astype('complex64')
+        input_2 = np.random.randint(
+            0,
+            100,
+            [
+                200,
+            ],
+        ).astype('complex64')
+
+        out_1 = np.reshape(input_1, [100, 1])
+        out_1 = np.broadcast_to(out_1, [100, 200])
+        out_2 = np.reshape(input_2, [1, 200])
+        out_2 = np.broadcast_to(out_2, [100, 200])
+
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.static.data(shape=[100], dtype='complex64', name='x')
+            y = paddle.static.data(shape=[200], dtype='complex64', name='y')
+
+            exe = base.Executor(place=base.CPUPlace())
+            grid_x, grid_y = paddle.tensor.meshgrid((x, y))
+            res_1, res_2 = exe.run(
+                paddle.static.default_main_program(),
+                feed={'x': input_1, 'y': input_2},
+                fetch_list=[grid_x, grid_y],
+            )
+        np.testing.assert_array_equal(res_1, out_1)
+        np.testing.assert_array_equal(res_2, out_2)
+
+
+class TestMeshgridOpComplexDygraph(unittest.TestCase):
+    def test_api_with_dygraph_tuple_input(self):
+        input_3 = np.random.randint(
+            0,
+            100,
+            [
+                100,
+            ],
+        ).astype('complex64')
+        input_4 = np.random.randint(
+            0,
+            100,
+            [
+                200,
+            ],
+        ).astype('complex64')
+
+        with base.dygraph.guard():
+            tensor_3 = paddle.to_tensor(input_3)
+            tensor_4 = paddle.to_tensor(input_4)
+            res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4))
+
+            np.testing.assert_array_equal(res_3.shape, [100, 200])
+            np.testing.assert_array_equal(res_4.shape, [100, 200])
+
+
 class TestMeshGrid_ZeroDim(TestMeshgridOp):
     def init_inputs_and_outputs(self):
         self.shape = self.get_x_shape()
diff --git a/test/deprecated/legacy_test/test_modified_huber_loss_op.py b/test/legacy_test/test_modified_huber_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_modified_huber_loss_op.py
rename to test/legacy_test/test_modified_huber_loss_op.py
diff --git a/test/deprecated/legacy_test/test_mul_op.py b/test/legacy_test/test_mul_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_mul_op.py
rename to test/legacy_test/test_mul_op.py
diff --git a/test/deprecated/legacy_test/test_multi_dot_op.py b/test/legacy_test/test_multi_dot_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_multi_dot_op.py
rename to test/legacy_test/test_multi_dot_op.py
diff --git a/test/deprecated/legacy_test/test_mv_op.py b/test/legacy_test/test_mv_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_mv_op.py
rename to test/legacy_test/test_mv_op.py
diff --git a/test/legacy_test/test_nce.py b/test/legacy_test/test_nce.py
new file mode 100644
index 0000000000000..c8a57ee5be488
--- /dev/null
+++ b/test/legacy_test/test_nce.py
@@ -0,0 +1,154 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import OpTest
+
+
+def nce(
+    input, weight, bias, sample_weight, labels, num_classes, num_sample_class
+):
+    samples = []
+    sample_labels = []
+    batch_size = input.shape[0]
+    num_true_class = labels.shape[1]
+    for i in range(batch_size):
+        w = 1 if sample_weight is None else sample_weight[i]
+        for label in labels[i]:
+            samples.append((i, label, True, w))
+            sample_labels.append(label)
+        for num in range(num_sample_class):
+            samples.append((i, num, False, w))
+            sample_labels.append(num)
+    # forward bias
+    sample_out = np.zeros(len(samples)).astype(np.float32)
+    if bias is not None:
+        for i in range(len(samples)):
+            sample_out[i] = bias[samples[i][1]]
+    # forward weight
+    for i in range(len(samples)):
+        sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
+
+    # forward activation
+    sample_out = 1.0 / (1.0 + np.exp(-sample_out))
+    # forward cost
+    out = np.zeros(batch_size).astype(np.float32)
+    b = 1.0 / num_classes * num_sample_class
+    for i in range(len(samples)):
+        o = sample_out[i]
+        cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
+        out[samples[i][0]] += cost * samples[i][3]
+    return (
+        out[:, np.newaxis],
+        np.array(sample_out).reshape(
+            batch_size, num_sample_class + num_true_class
+        ),
+        np.array(sample_labels).reshape(
+            batch_size, num_sample_class + num_true_class
+        ),
+    )
+
+
+class TestNCE(OpTest):
+    def generate_data(
+        self,
+        dim,
+        batch_size,
+        num_classes,
+        num_true_class,
+        num_neg_samples,
+        is_sparse,
+    ):
+        input = np.random.randn(batch_size, dim).astype(np.float32)
+        weight = np.random.randn(num_classes, dim).astype(np.float32)
+        bias = np.random.randn(num_classes).astype(np.float32)
+        sample_weight = np.random.randn(batch_size).astype(np.float32)
+        labels = np.random.randint(
+            0, num_classes, (batch_size, num_true_class)
+        ).astype("int64")
+        self.attrs = {
+            'num_total_classes': num_classes,
+            'num_neg_samples': num_neg_samples,
+            'custom_neg_classes': list(range(num_neg_samples)),
+            'seed': 0,
+            'sampler': 0,
+            'is_sparse': is_sparse,
+            'is_test': self.is_test,
+        }
+        self.inputs = {
+            'Input': input,
+            'Label': labels,
+            'Weight': weight,
+            'Bias': bias,
+            'SampleWeight': sample_weight,
+        }
+
+    def set_is_test(self):
+        self.is_test = False
+
+    def set_data(self):
+        self.generate_data(5, 25, 100, 1, 2, False)
+
+    def compute(self):
+        out = nce(
+            self.inputs['Input'],
+            self.inputs['Weight'],
+            self.inputs['Bias'],
+            self.inputs['SampleWeight'],
+            self.inputs['Label'],
+            self.attrs['num_total_classes'],
+            self.attrs['num_neg_samples'],
+        )
+        if self.is_test:
+            self.outputs = {'Cost': out[0]}
+        else:
+            self.outputs = {
+                'Cost': out[0],
+                'SampleLogits': out[1],
+                'SampleLabels': out[2],
+            }
+
+    def setUp(self):
+        self.op_type = 'nce'
+        self.set_is_test()
+        self.set_data()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02
+        )
+
+
+class TestNCECase1Tensor(TestNCE):
+    def set_data(self):
+        self.generate_data(10, 20, 100, 2, 5, False)
+
+
+class TestNCETensorIsTest(TestNCE):
+    # if is_test = True, there's no need to calculate grad
+    def set_is_test(self):
+        self.is_test = True
+
+    def test_check_grad(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_nearest_interp_op.py b/test/legacy_test/test_nearest_interp_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_nearest_interp_op.py
rename to test/legacy_test/test_nearest_interp_op.py
diff --git a/test/deprecated/legacy_test/test_nearest_interp_v2_op.py b/test/legacy_test/test_nearest_interp_v2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_nearest_interp_v2_op.py
rename to test/legacy_test/test_nearest_interp_v2_op.py
diff --git a/test/deprecated/legacy_test/test_ops_nms.py b/test/legacy_test/test_ops_nms.py
similarity index 100%
rename from test/deprecated/legacy_test/test_ops_nms.py
rename to test/legacy_test/test_ops_nms.py
diff --git a/test/legacy_test/test_optimizer.py b/test/legacy_test/test_optimizer.py
new file mode 100644
index 0000000000000..63273c2eb9928
--- /dev/null
+++ b/test/legacy_test/test_optimizer.py
@@ -0,0 +1,169 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import unittest
+
+import numpy
+import numpy as np
+
+import paddle
+from paddle import base
+from paddle.base import core
+from paddle.base.framework import (
+    convert_np_dtype_to_dtype_,
+)
+from paddle.io import Dataset
+
+
+class TestOptimizerDtype(unittest.TestCase):
+    '''
+    The dtype of optimizer should be inferred by parameters, and the learning rate
+    is created with the same dtype.
+    '''
+
+    def check_with_dtype(self, dtype):
+        class MyLayer(paddle.nn.Layer):
+            def __init__(self, dtype):
+                super().__init__()
+                self._w = self.create_parameter([2, 3], dtype=dtype)
+                self._b = self.create_parameter([2, 3], dtype=dtype)
+
+            def forward(self, x):
+                return x * self._w + self._b
+
+        with paddle.base.dygraph.guard():
+            model = MyLayer(dtype)
+            x = paddle.rand([10, 2, 3], dtype=dtype)
+            loss = model(x)
+            adam = paddle.optimizer.Adam(parameters=model.parameters())
+            loss.backward()
+            adam.step()
+            self.assertEqual(adam._dtype, convert_np_dtype_to_dtype_(dtype))
+
+    def test_float64(self):
+        self.check_with_dtype('float64')
+
+    def test_float32(self):
+        self.check_with_dtype('float32')
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or paddle.device.cuda.get_device_capability()[0] < 7.0,
+    "run test when gpu's compute capability is at least 7.0.",
+)
+class TestMasterWeightSaveForFP16(unittest.TestCase):
+    '''
+    For AMP-O2, some optimizers (Momentum, Adam, ...) create master weights for parameters to improve accuracy.
+    Master weights will be saved by optimizer::state_dict.
+    '''
+
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
+    def check_with_opt_state_dict(self, use_save_load=True):
+        paddle.seed(100)
+        numpy.random.seed(100)
+
+        class SimpleNet(paddle.nn.Layer):
+            def __init__(self, input_size, output_size):
+                super().__init__()
+                self.linears = paddle.nn.LayerList(
+                    [
+                        paddle.nn.Linear(input_size, output_size)
+                        for i in range(1)
+                    ]
+                )
+
+            def forward(self, x):
+                for i, l in enumerate(self.linears):
+                    x = self.linears[i](x)
+                return x
+
+        input_size = 2  # should be set to a larger value
+        output_size = 2  # should be set to a larger value
+        batch_size = 2  # batch_size should be a multiple of 8
+        nums_batch = 10
+
+        class RandomDataset(Dataset):
+            def __init__(self, num_samples):
+                self.num_samples = num_samples
+
+            def __getitem__(self, idx):
+                data = numpy.random.random([input_size]).astype('float16')
+                label = numpy.random.random([output_size]).astype('float16')
+                return data, label
+
+            def __len__(self):
+                return self.num_samples
+
+        dataset = RandomDataset(nums_batch * batch_size)
+        loader = paddle.io.DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=False,
+            drop_last=True,
+            num_workers=0,
+        )
+
+        mse = paddle.nn.MSELoss()
+        model = SimpleNet(input_size, output_size)  # define the model
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=0.0001,
+            parameters=model.parameters(),
+            multi_precision=True,
+        )  # define the optimizer
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+        model = paddle.amp.decorate(models=model, level='O2')
+
+        for i, (data, label) in enumerate(loader):
+            with paddle.amp.auto_cast(level='O2'):
+                output = model(data)
+                loss = mse(output, label)
+            scaled = scaler.scale(loss)
+            scaled.backward()
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.clear_grad(set_to_zero=False)
+
+            if use_save_load and i == 5:
+                model_path = os.path.join(self.temp_dir.name, "model.pdparams")
+                optimizer_path = os.path.join(self.temp_dir.name, "opt.pdopt")
+                paddle.save(model.state_dict(), model_path)
+                paddle.save(optimizer.state_dict(), optimizer_path)
+                model.set_state_dict(paddle.load(model_path))
+                optimizer.set_state_dict(paddle.load(optimizer_path))
+
+        return loss.numpy()
+
+    def test_with_state_dict(self):
+        if core.is_compiled_with_cuda():
+            with base.dygraph.guard():
+                out_use_state_dict = self.check_with_opt_state_dict(
+                    use_save_load=True
+                )
+                out_no_state_dict = self.check_with_opt_state_dict(
+                    use_save_load=False
+                )
+            np.testing.assert_array_equal(out_use_state_dict, out_no_state_dict)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_overlap_add_op.py b/test/legacy_test/test_overlap_add_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_overlap_add_op.py
rename to test/legacy_test/test_overlap_add_op.py
diff --git a/test/deprecated/legacy_test/test_pad3d_op.py b/test/legacy_test/test_pad3d_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_pad3d_op.py
rename to test/legacy_test/test_pad3d_op.py
diff --git a/test/deprecated/legacy_test/test_paddle_save_load_binary.py b/test/legacy_test/test_paddle_save_load_binary.py
similarity index 100%
rename from test/deprecated/legacy_test/test_paddle_save_load_binary.py
rename to test/legacy_test/test_paddle_save_load_binary.py
diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel.py b/test/legacy_test/test_parallel_dygraph_dataparallel.py
index 648f6ddd97ef2..166687ce098e4 100644
--- a/test/legacy_test/test_parallel_dygraph_dataparallel.py
+++ b/test/legacy_test/test_parallel_dygraph_dataparallel.py
@@ -66,7 +66,7 @@ def start_local_trainers_cpu(
         proc_env = {
             "PADDLE_DISTRI_BACKEND": "gloo",
             "PADDLE_TRAINER_ID": "%d" % rank_id,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint,
+            "PADDLE_CURRENT_ENDPOINT": f"{endpoint}",
             "PADDLE_TRAINERS_NUM": "%d" % n_rank,
             "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
         }
@@ -118,10 +118,11 @@ def start_local_trainers(
     procs = []
     for t in pod.trainers:
         proc_env = {
-            f"FLAGS_selected_{accelerator_type}s": "%s"
-            % ",".join([str(g) for g in t.gpus]),
+            f"FLAGS_selected_{accelerator_type}s": "{}".format(
+                ",".join([str(g) for g in t.gpus])
+            ),
             "PADDLE_TRAINER_ID": "%d" % t.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
+            "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}",
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
             "FLAGS_dynamic_static_unified_comm": "0",
diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py b/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py
index 5a944284414bf..cd1b89e064d6e 100644
--- a/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py
+++ b/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py
@@ -66,7 +66,7 @@ def start_local_trainers(
     for t in pod.trainers:
         proc_env = {
             "PADDLE_TRAINER_ID": "%d" % t.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
+            "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}",
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
             "MASTER_ADDR": "127.0.0.1",
diff --git a/test/deprecated/legacy_test/test_partial_concat_op.py b/test/legacy_test/test_partial_concat_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_partial_concat_op.py
rename to test/legacy_test/test_partial_concat_op.py
diff --git a/test/deprecated/legacy_test/test_partial_sum_op.py b/test/legacy_test/test_partial_sum_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_partial_sum_op.py
rename to test/legacy_test/test_partial_sum_op.py
diff --git a/test/deprecated/legacy_test/test_pixel_shuffle_op.py b/test/legacy_test/test_pixel_shuffle_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_pixel_shuffle_op.py
rename to test/legacy_test/test_pixel_shuffle_op.py
diff --git a/test/legacy_test/test_pool1d_api.py b/test/legacy_test/test_pool1d_api.py
index 6fac04f468ebe..fce76e98f9f8e 100644
--- a/test/legacy_test/test_pool1d_api.py
+++ b/test/legacy_test/test_pool1d_api.py
@@ -115,6 +115,62 @@ def avg_pool1D_forward_naive(
     return out
 
 
+def lp_pool1D_forward_naive(
+    x,
+    ksize,
+    strides,
+    paddings,
+    global_pool=0,
+    ceil_mode=False,
+    data_format='NCL',
+    norm_type=None,
+):
+    assert norm_type is not None
+    if x.dtype == np.float16:
+        x = x.astype(np.float32)
+    if data_format == "NCL":
+        N, C, L = x.shape
+    else:
+        N, L, C = x.shape
+
+    if global_pool == 1:
+        ksize = [L]
+    L_out = (
+        (L - ksize[0] + 2 * paddings[0] + strides[0] - 1) // strides[0] + 1
+        if ceil_mode
+        else (L - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    )
+
+    if data_format == "NCL":
+        out = np.zeros((N, C, L_out))
+    else:
+        out = np.zeros((N, L_out, C))
+    for i in range(L_out):
+        r_start = np.max((i * strides[0] - paddings[0], 0))
+        r_end = np.min((i * strides[0] + ksize[0] - paddings[0], L))
+        if data_format == "NCL":
+            x_masked = x[:, :, r_start:r_end]
+        else:
+            x_masked = x[:, r_start:r_end, :]
+        if data_format == "NCL":
+            if norm_type == float('inf'):
+                out[:, :, i] = np.max(x_masked, axis=(2))
+            else:
+                out[:, :, i] = np.power(
+                    np.sum(np.power(x_masked, norm_type), axis=(2)),
+                    1 / norm_type,
+                )
+        else:
+            if norm_type == float('inf'):
+                out[:, i, :] = np.max(x_masked, axis=(1))
+            else:
+                out[:, i, :] = np.power(
+                    np.sum(np.power(x_masked, norm_type), axis=(1)),
+                    1 / norm_type,
+                )
+    return out
+
+
 class TestPool1D_API(unittest.TestCase):
     def setUp(self):
         np.random.seed(123)
@@ -296,6 +352,270 @@ def check_avg_dygraph_padding_same(self, place):
 
             np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
 
+    @test_with_pir_api
+    def check_lp_static_results(self, place):
+        with paddle.static.program_guard(paddle.static.Program()):
+            input = paddle.static.data(
+                name="input", shape=[2, 3, 32], dtype="float32"
+            )
+            result = F.lp_pool1d(
+                input, norm_type=2, kernel_size=2, stride=2, padding=0
+            )
+
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            result_np = lp_pool1D_forward_naive(
+                input_np,
+                ksize=[2],
+                strides=[2],
+                paddings=[0],
+                ceil_mode=False,
+                norm_type=2,
+            )
+
+            exe = paddle.static.Executor(place)
+            fetches = exe.run(
+                feed={"input": input_np},
+                fetch_list=[result],
+            )
+            np.testing.assert_allclose(fetches[0], result_np, rtol=1e-05)
+
+    @test_with_pir_api
+    def check_lp_static_results_fp16(self, place):
+        if core.is_compiled_with_cuda():
+            with paddle.static.program_guard(paddle.static.Program()):
+                input = paddle.static.data(
+                    name="input", shape=[2, 3, 32], dtype="float16"
+                )
+                result = F.lp_pool1d(
+                    input, norm_type=3, kernel_size=2, stride=2, padding=0
+                )
+
+                input_np = np.random.random([2, 3, 32]).astype("float16")
+                result_np = lp_pool1D_forward_naive(
+                    input_np,
+                    ksize=[2],
+                    strides=[2],
+                    paddings=[0],
+                    ceil_mode=False,
+                    norm_type=3,
+                )
+
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                fetches = exe.run(
+                    feed={"input": input_np},
+                    fetch_list=[result],
+                )
+                np.testing.assert_allclose(
+                    fetches[0], result_np.astype(np.float16), rtol=1e-05
+                )
+
+    @test_with_pir_api
+    def check_lp_static_results_fp64(self, place):
+        if core.is_compiled_with_cuda():
+            with paddle.static.program_guard(paddle.static.Program()):
+                input = paddle.static.data(
+                    name="input", shape=[2, 3, 32], dtype="float64"
+                )
+                result = F.lp_pool1d(
+                    input, norm_type=3, kernel_size=2, stride=2, padding=0
+                )
+
+                input_np = np.random.random([2, 3, 32]).astype("float64")
+                result_np = lp_pool1D_forward_naive(
+                    input_np,
+                    ksize=[2],
+                    strides=[2],
+                    paddings=[0],
+                    ceil_mode=False,
+                    norm_type=3,
+                )
+
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                fetches = exe.run(
+                    feed={"input": input_np},
+                    fetch_list=[result],
+                )
+                np.testing.assert_allclose(fetches[0], result_np, rtol=1e-05)
+
+    def check_lp_dygraph_results(self, place):
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            result = F.lp_pool1d(
+                input, norm_type=4, kernel_size=3, stride=2, padding=[1]
+            )
+
+            result_np = lp_pool1D_forward_naive(
+                input_np,
+                ksize=[3],
+                strides=[2],
+                paddings=[1],
+                norm_type=4,
+            )
+
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool1d_dg = paddle.nn.layer.LPPool1D(
+                norm_type=4, kernel_size=3, stride=2, padding=1
+            )
+            result = lp_pool1d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_float16_results(self, place):
+        if isinstance(place, base.CUDAPlace):
+            with base.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 32]).astype("float16")
+                input = paddle.to_tensor(input_np)
+                result = F.lp_pool1d(
+                    input, norm_type=5, kernel_size=5, stride=3, padding=[0]
+                )
+
+                result_np = lp_pool1D_forward_naive(
+                    input_np, ksize=[5], strides=[3], paddings=[0], norm_type=5
+                )
+
+                np.testing.assert_allclose(
+                    result.numpy(), result_np.astype(np.float16), rtol=1e-05
+                )
+
+                lp_pool1d_dg = paddle.nn.layer.LPPool1D(
+                    norm_type=5, kernel_size=5, stride=3, padding=0
+                )
+                result = lp_pool1d_dg(input)
+                np.testing.assert_allclose(
+                    result.numpy(), result_np.astype(np.float16), rtol=1e-05
+                )
+
+    def check_lp_dygraph_float64_results(self, place):
+        if isinstance(place, base.CUDAPlace):
+            with base.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 32]).astype("float64")
+                input = paddle.to_tensor(input_np)
+                result = F.lp_pool1d(
+                    input, norm_type=5, kernel_size=5, stride=3, padding=[0]
+                )
+
+                result_np = lp_pool1D_forward_naive(
+                    input_np, ksize=[5], strides=[3], paddings=[0], norm_type=5
+                )
+
+                np.testing.assert_allclose(
+                    result.numpy(), result_np, rtol=1e-05
+                )
+
+                lp_pool1d_dg = paddle.nn.layer.LPPool1D(
+                    norm_type=5, kernel_size=5, stride=3, padding=0
+                )
+                result = lp_pool1d_dg(input)
+                np.testing.assert_allclose(
+                    result.numpy(), result_np, rtol=1e-05
+                )
+
+    def check_lp_dygraph_ceil_mode_results(self, place):
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            result = F.lp_pool1d(
+                input,
+                norm_type=7,
+                kernel_size=2,
+                stride=2,
+                padding=[1],
+                ceil_mode=True,
+            )
+
+            result_np = lp_pool1D_forward_naive(
+                input_np,
+                ksize=[2],
+                strides=[2],
+                paddings=[1],
+                ceil_mode=True,
+                norm_type=7,
+            )
+
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool1d_dg = paddle.nn.LPPool1D(
+                norm_type=7,
+                kernel_size=2,
+                stride=None,
+                ceil_mode=True,
+                padding=1,
+            )
+
+            result = lp_pool1d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_data_format_results(self, place):
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 32, 3]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            result = F.lp_pool1d(
+                input,
+                norm_type=7,
+                kernel_size=2,
+                stride=2,
+                padding=[1],
+                ceil_mode=True,
+                data_format="NLC",
+            )
+
+            result_np = lp_pool1D_forward_naive(
+                input_np,
+                ksize=[2],
+                strides=[2],
+                paddings=[1],
+                ceil_mode=True,
+                data_format="NLC",
+                norm_type=7,
+            )
+
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool1d_dg = paddle.nn.LPPool1D(
+                norm_type=7,
+                kernel_size=2,
+                stride=None,
+                data_format="NLC",
+                padding=1,
+            )
+
+            result = lp_pool1d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_inf_norm_type(self, place):
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            result = F.lp_pool1d(
+                input,
+                norm_type=float('inf'),
+                kernel_size=2,
+                stride=2,
+                padding=[1],
+                ceil_mode=True,
+            )
+
+            result_np = lp_pool1D_forward_naive(
+                input_np,
+                ksize=[2],
+                strides=[2],
+                paddings=[1],
+                ceil_mode=True,
+                norm_type=float("inf"),
+            )
+
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool1d_dg = paddle.nn.LPPool1D(
+                norm_type=float('inf'), kernel_size=2, stride=None, padding=1
+            )
+
+            result = lp_pool1d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
     def test_pool1d(self):
         for place in self.places:
             self.check_max_dygraph_results(place)
@@ -306,6 +626,15 @@ def test_pool1d(self):
             self.check_avg_dygraph_padding_same(place)
             self.check_max_dygraph_return_index_results(place)
             self.check_avg_static_results_fp16(place)
+            self.check_lp_static_results(place)
+            self.check_lp_dygraph_results(place)
+            self.check_lp_static_results_fp16(place)
+            self.check_lp_static_results_fp64(place)
+            self.check_lp_dygraph_inf_norm_type(place)
+            self.check_lp_dygraph_float16_results(place)
+            self.check_lp_dygraph_float64_results(place)
+            self.check_lp_dygraph_ceil_mode_results(place)
+            self.check_lp_dygraph_data_format_results(place)
 
 
 class TestPool1DError_API(unittest.TestCase):
diff --git a/test/legacy_test/test_pool2d_api.py b/test/legacy_test/test_pool2d_api.py
index ff4084d112301..f125bf7315a93 100644
--- a/test/legacy_test/test_pool2d_api.py
+++ b/test/legacy_test/test_pool2d_api.py
@@ -27,7 +27,7 @@
 import paddle
 from paddle import base
 from paddle.base import core
-from paddle.nn.functional import avg_pool2d, max_pool2d
+from paddle.nn.functional import avg_pool2d, lp_pool2d, max_pool2d
 from paddle.pir_utils import test_with_pir_api
 
 
@@ -360,6 +360,400 @@ def check_avg_divisor(self, place):
             result = avg_pool2d_dg(input)
             np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
 
+    def check_lp_static_results(self, place):  # static-graph lp_pool2d (float32) checked against pool2D_forward_naive
+        with paddle.static.program_guard(  # fresh main/startup programs so nothing leaks between checks
+            paddle.static.Program(), paddle.static.Program()
+        ):
+            input = paddle.static.data(
+                name="input", shape=[2, 3, 128, 128], dtype="float32"
+            )
+            norm_type = 2
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=4,
+                stride=4,
+                ceil_mode=True,
+            )
+
+            input_np = np.random.random([2, 3, 128, 128]).astype("float32")
+            result_np = pool2D_forward_naive(  # expected values, same window/stride/ceil_mode as the op above
+                input_np,
+                ksize=[4, 4],
+                paddings=[0, 0],
+                strides=[4, 4],
+                ceil_mode=True,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+
+            exe = base.Executor(place)
+            fetches = exe.run(  # no program argument is passed here
+                feed={"input": input_np},
+                fetch_list=[result],
+            )
+            np.testing.assert_allclose(fetches[0], result_np, rtol=1e-05)
+
+    def check_lp_dygraph_results(self, place):  # dygraph lp_pool2d, functional API then LPPool2D layer, vs. the naive reference
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            norm_type = 2
+            result = lp_pool2d(  # functional form: 2x2 window, stride 1
+                input,
+                norm_type,
+                kernel_size=2,
+                stride=1,
+                ceil_mode=False,
+            )
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                paddings=[0, 0],
+                strides=[1, 1],
+                ceil_mode=False,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                norm_type=norm_type,
+                kernel_size=2,
+                stride=1,
+                ceil_mode=False,
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_results_norm_type_is_inf(self, place):  # dygraph lp_pool2d with norm_type=+inf and a non-square kernel
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            norm_type = np.inf  # NOTE(review): pool2D_forward_naive in test_pool2d_op.py maps +inf to 'max' pooling; presumably that is the helper used here
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=[2, 4],
+                stride=2,
+                ceil_mode=False,
+            )
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 4],
+                paddings=[0, 0],
+                strides=[2, 2],
+                ceil_mode=False,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                norm_type=norm_type,
+                kernel_size=[2, 4],
+                stride=2,
+                ceil_mode=False,
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_results_norm_type_is_negative_inf(self, place):  # dygraph lp_pool2d with norm_type=-inf
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            norm_type = -np.inf  # NOTE(review): the reference in test_pool2d_op.py only special-cases +inf, so -inf goes through its generic power/sum formula; confirm that is the intended expectation
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=2,
+                stride=2,
+                ceil_mode=False,
+            )
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[2, 2],
+                paddings=[0, 0],
+                strides=[2, 2],
+                ceil_mode=False,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                norm_type=norm_type,
+                kernel_size=2,
+                stride=2,
+                ceil_mode=False,
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_ceilmode_results(self, place):  # dygraph lp_pool2d with ceil_mode=True (5x5 window, stride 3 on 32x32 input)
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            norm_type = 2
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=5,
+                stride=3,
+                ceil_mode=True,
+            )
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[5, 5],
+                paddings=[0, 0],
+                strides=[3, 3],
+                ceil_mode=True,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                norm_type=norm_type,
+                kernel_size=5,
+                stride=3,
+                ceil_mode=True,
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_dygraph_nhwc_results(self, place):  # dygraph lp_pool2d with data_format="NHWC"
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = paddle.to_tensor(np.transpose(input_np, [0, 2, 3, 1]))  # the op is fed the channels-last transpose of input_np
+            norm_type = 2
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=2,
+                stride=2,
+                ceil_mode=False,
+                data_format="NHWC",
+            )
+            result_np = pool2D_forward_naive(  # reference runs on the untransposed input_np with its default data_format
+                input_np,
+                ksize=[2, 2],
+                paddings=[0, 0],
+                strides=[2, 2],
+                ceil_mode=False,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+            np.testing.assert_allclose(
+                np.transpose(result.numpy(), [0, 3, 1, 2]),  # move channels back to axis 1 before comparing
+                result_np,
+                rtol=1e-05,
+            )
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form; stride is given as a list here
+                norm_type=norm_type,
+                kernel_size=2,
+                stride=[2, 2],
+                ceil_mode=False,
+                data_format="NHWC",
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(
+                np.transpose(result.numpy(), [0, 3, 1, 2]),
+                result_np,
+                rtol=1e-05,
+            )
+
+    def check_lp_dygraph_stride_is_none(self, place):  # dygraph lp_pool2d with stride=None
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float32")
+            input = paddle.to_tensor(input_np)
+            norm_type = 2
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=2,
+                stride=None,
+                ceil_mode=False,
+            )
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                paddings=[0, 0],
+                ksize=[2, 2],
+                strides=[2, 2],  # the expectation encodes stride=None as stride == kernel size
+                ceil_mode=False,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                norm_type=norm_type,
+                kernel_size=2,
+                stride=None,
+                ceil_mode=False,
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    def check_lp_float16_static(self, place):  # static-graph lp_pool2d in float16
+        if isinstance(place, base.CUDAPlace):  # the whole check is a no-op for any non-CUDA place
+            with paddle.static.program_guard(
+                paddle.static.Program(), paddle.static.Program()
+            ):
+                input = paddle.static.data(
+                    name="input", shape=[2, 3, 64, 64], dtype="float16"
+                )
+                norm_type = 2
+                result = lp_pool2d(
+                    input,
+                    norm_type,
+                    kernel_size=4,
+                    stride=[2, 4],
+                    ceil_mode=True,
+                )
+
+                input_np = np.random.random([2, 3, 64, 64]).astype("float16")
+                result_np = pool2D_forward_naive(
+                    input_np,
+                    ksize=[4, 4],
+                    paddings=[0, 0],
+                    strides=[2, 4],
+                    ceil_mode=True,
+                    norm_type=norm_type,
+                    pool_type='lp',
+                )
+
+                exe = base.Executor(place)
+                fetches = exe.run(
+                    feed={"input": input_np},
+                    fetch_list=[result],
+                )
+                np.testing.assert_allclose(  # reference is cast to float16 and the tolerance loosened to 1e-03
+                    fetches[0], result_np.astype(np.float16), rtol=1e-03
+                )
+
+    def check_lp_float64_static(self, place):  # static-graph lp_pool2d in float64 with ceil_mode=True
+        with paddle.static.program_guard(
+            paddle.static.Program(), paddle.static.Program()
+        ):
+            input = paddle.static.data(
+                name="input", shape=[2, 3, 64, 64], dtype="float64"
+            )
+            norm_type = 2
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=5,
+                stride=3,
+                ceil_mode=True,
+            )
+
+            input_np = np.random.random([2, 3, 64, 64]).astype("float64")
+            result_np = pool2D_forward_naive(  # expected values, same window/stride/ceil_mode as the op above
+                input_np,
+                ksize=[5, 5],
+                paddings=[0, 0],
+                strides=[3, 3],
+                ceil_mode=True,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+
+            exe = base.Executor(place)
+            fetches = exe.run(
+                feed={"input": input_np},
+                fetch_list=[result],
+            )
+            np.testing.assert_allclose(fetches[0], result_np, rtol=1e-05)
+
+    def check_lp_dygraph_float16(self, place):  # dygraph lp_pool2d in float16, functional API then LPPool2D layer
+        if isinstance(place, base.CUDAPlace):  # the whole check is a no-op for any non-CUDA place
+            with base.dygraph.guard(place):
+                input_np = np.random.random([2, 3, 32, 32]).astype("float16")
+                input = paddle.to_tensor(input_np)
+                norm_type = 2
+                result = lp_pool2d(
+                    input,
+                    norm_type,
+                    kernel_size=3,
+                    stride=2,
+                    ceil_mode=False,
+                )
+
+                result_np = pool2D_forward_naive(
+                    input_np,
+                    ksize=[3, 3],
+                    paddings=[0, 0],
+                    strides=[2, 2],
+                    ceil_mode=False,
+                    norm_type=norm_type,
+                    pool_type='lp',
+                )
+                np.testing.assert_allclose(  # reference cast to float16, matching the layer check below and the static fp16 check
+                    result.numpy(), result_np.astype(np.float16), rtol=1e-03
+                )
+
+                lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                    norm_type=norm_type,
+                    kernel_size=3,
+                    stride=2,
+                    ceil_mode=False,
+                )
+                result = lp_pool2d_dg(input)
+                np.testing.assert_allclose(
+                    result.numpy(), result_np.astype(np.float16), rtol=1e-03
+                )
+
+    def check_lp_dygraph_float64(self, place):  # dygraph lp_pool2d in float64, functional API then LPPool2D layer
+        with base.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32, 32]).astype("float64")
+            input = paddle.to_tensor(input_np)
+            norm_type = 2
+            result = lp_pool2d(
+                input,
+                norm_type,
+                kernel_size=5,
+                stride=3,
+                ceil_mode=False,
+            )
+
+            result_np = pool2D_forward_naive(
+                input_np,
+                ksize=[5, 5],
+                paddings=[0, 0],
+                strides=[3, 3],
+                ceil_mode=False,
+                norm_type=norm_type,
+                pool_type='lp',
+            )
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+            lp_pool2d_dg = paddle.nn.layer.LPPool2D(  # layer form with the same settings; must match the same reference
+                norm_type=norm_type,
+                kernel_size=5,
+                stride=3,
+                ceil_mode=False,
+            )
+            result = lp_pool2d_dg(input)
+            np.testing.assert_allclose(result.numpy(), result_np, rtol=1e-05)
+
+    @test_with_pir_api
+    def test_pool2d_static(self):  # runs every static-graph check (max, avg, lp) on each place in self.places
+        paddle.enable_static()
+        for place in self.places:
+            self.check_max_static_results(place)
+            self.check_avg_static_results(place)
+            self.check_lp_static_results(place)
+            self.check_lp_float64_static(place)
+            self.check_lp_float16_static(place)  # does nothing unless place is a CUDAPlace
+        paddle.disable_static()  # NOTE(review): not reached if a check raises, so static mode would stay enabled
+
     def test_pool2d(self):
         for place in self.places:
             self.check_max_dygraph_results(place)
@@ -371,14 +765,14 @@ def test_pool2d(self):
             self.check_max_dygraph_padding_results(place)
             self.check_max_dygraph_ceilmode_results(place)
             self.check_max_dygraph_nhwc_results(place)
-
-    @test_with_pir_api
-    def test_pool2d_static(self):
-        paddle.enable_static()
-        for place in self.places:
-            self.check_max_static_results(place)
-            self.check_avg_static_results(place)
-        paddle.disable_static()
+            self.check_lp_dygraph_results(place)
+            self.check_lp_dygraph_stride_is_none(place)
+            self.check_lp_dygraph_ceilmode_results(place)
+            self.check_lp_dygraph_nhwc_results(place)
+            self.check_lp_dygraph_results_norm_type_is_inf(place)
+            self.check_lp_dygraph_results_norm_type_is_negative_inf(place)
+            self.check_lp_dygraph_float64(place)
+            self.check_lp_dygraph_float16(place)
 
 
 class TestPool2DError_API(unittest.TestCase):
@@ -630,6 +1024,16 @@ def run_zero_tuple_stride():
 
         self.assertRaises(ValueError, run_zero_tuple_stride)
 
+        def run_zero_norm_type():  # calls lp_pool2d with norm_type=0; the assertion below expects ValueError
+            with base.dygraph.guard():
+                array = np.array([1], dtype=np.float32)
+                x = paddle.to_tensor(
+                    np.reshape(array, [1, 1, 1, 1]), dtype='float32'
+                )
+                out = lp_pool2d(x, 0, 2)  # positional args: x, norm_type=0, kernel_size=2; `out` is never read
+
+        self.assertRaises(ValueError, run_zero_norm_type)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/legacy_test/test_pool2d_op.py b/test/legacy_test/test_pool2d_op.py
index b2f10e3af1b26..8fc52ffec9c99 100644
--- a/test/legacy_test/test_pool2d_op.py
+++ b/test/legacy_test/test_pool2d_op.py
@@ -154,7 +154,11 @@ def pool2D_forward_naive(
     data_format='NCHW',
     pool_type="max",
     padding_algorithm="EXPLICIT",
+    norm_type=0,
 ):
+    if norm_type == float("inf"):
+        pool_type = 'max'
+
     # update paddings
     def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         padding = []
@@ -273,6 +277,14 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                     out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
                 elif pool_type == 'max':
                     out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+                else:  # lp_pool2d
+                    if norm_type == 0:
+                        out[:, :, i, j] = 1
+                    else:
+                        out[:, :, i, j] = np.power(
+                            np.sum(np.power(x_masked, norm_type), axis=(2, 3)),
+                            1.0 / norm_type,
+                        )
             elif data_format == 'NHWC':
                 x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :]
                 if pool_type == 'avg':
@@ -283,6 +295,14 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                     out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size
                 elif pool_type == 'max':
                     out[:, i, j, :] = np.max(x_masked, axis=(1, 2))
+                else:  # lp_pool2d: (sum over the window of x ** norm_type) ** (1 / norm_type)
+                    if norm_type == 0:
+                        out[:, i, j, :] = 1
+                    else:
+                        out[:, i, j, :] = np.power(  # NHWC: the window's H and W are axes 1 and 2, as in the avg/max branches
+                            np.sum(np.power(x_masked, norm_type), axis=(1, 2)),
+                            1.0 / norm_type,
+                        )
     return out
 
 
@@ -348,6 +368,37 @@ def pool2d_wrapper_use_cudnn(
     )
 
 
+def lp_pool2d_wrapper(  # adapter assigned to TestLPPool2D_Op.python_api; forwards its arguments to paddle._C_ops.lp_pool2d
+    X,
+    ksize=[],
+    strides=[],
+    paddings=[],
+    ceil_mode=False,
+    exclusive=True,
+    data_format="NCDHW",
+    pooling_type="lp",
+    global_pooling=False,
+    adaptive=False,
+    padding_algorithm="EXPLICIT",
+):
+    if data_format == "AnyLayout":  # "AnyLayout" is replaced by "NCDHW" before the call
+        data_format = "NCDHW"
+    return paddle._C_ops.lp_pool2d(
+        X,
+        ksize,
+        strides,
+        paddings,
+        ceil_mode,
+        exclusive,
+        data_format,
+        pooling_type,
+        global_pooling,
+        adaptive,
+        padding_algorithm,
+        2,  # NOTE(review): hard-coded; presumably norm_type (TestLPPool2D_Op uses norm_type = 2) - confirm against the op signature
+    )
+
+
 class TestPool2D_Op_Mixin:
     def setUp(self):
         self.op_type = "pool2d"
@@ -503,6 +554,85 @@ class TestPool2D_Op(TestPool2D_Op_Mixin, OpTest):
     pass
 
 
+class TestLPPool2D_Op(TestPool2D_Op):  # OpTest case for the "lp_pool2d" op; the init_* hooks come from TestPool2D_Op
+    def setUp(self):  # builds inputs, attrs and the expected output from pool2D_forward_naive
+        self.op_type = "lp_pool2d"
+        self.use_cudnn = False
+        self.init_kernel_type()
+        self.use_mkldnn = False
+        self.init_data_type()
+        self.init_test_case()
+        self.padding_algorithm = "EXPLICIT"
+        self.init_paddings()
+        self.init_global_pool()
+        self.init_kernel_type()  # NOTE(review): second call to init_kernel_type() in this method
+        self.init_ceil_mode()
+        self.init_exclusive()
+        self.init_adaptive()
+        self.init_data_format()
+        self.init_shape()
+        self.norm_type = 2
+        self.pool_type = 'lp'
+
+        if self.is_bfloat16_op():  # bfloat16 data is generated as float32 and converted to uint16 further down
+            input = np.random.random(self.shape).astype(np.float32)
+        else:
+            input = np.random.random(self.shape).astype(self.dtype)
+
+        output = pool2D_forward_naive(  # positional order follows the helper's signature; norm_type is its last parameter
+            input,
+            self.ksize,
+            self.strides,
+            self.paddings,
+            self.global_pool,
+            self.ceil_mode,
+            self.exclusive,
+            self.adaptive,
+            self.data_format,
+            self.pool_type,
+            self.padding_algorithm,
+            self.norm_type,
+        )
+
+        if self.is_bfloat16_op():
+            output = convert_float_to_uint16(output)
+            self.inputs = {'x': convert_float_to_uint16(input)}
+        else:
+            output = output.astype(self.dtype)
+            self.inputs = {'x': OpTest.np_dtype_to_base_dtype(input)}
+
+        self.attrs = {  # NOTE(review): self.exclusive and self.adaptive feed the reference above but are not listed here
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'kernel_size': self.ksize,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
+            'ceil_mode': self.ceil_mode,
+            'data_format': self.data_format,
+            "padding_algorithm": self.padding_algorithm,
+            'norm_type': self.norm_type,
+        }
+
+        self.outputs = {'out': output}
+
+        self.python_api = lp_pool2d_wrapper
+
+    def has_cudnn(self):  # always False for this case
+        return False
+
+    def test_check_grad(self):  # gradient check on input 'x' w.r.t. output 'out'
+        if self.dtype == np.float16:  # the gradient check is skipped for float16
+            return
+        self.check_grad(
+            {'x'},
+            'out',
+            max_relative_error=0.07,
+            check_dygraph=(not self.use_mkldnn),
+            check_pir=True,
+            check_pir_onednn=self.check_pir_onednn,
+        )
+
+
 class TestCase1(TestPool2D_Op):
     def init_test_case(self):
         self.ksize = [3, 3]
diff --git a/test/deprecated/legacy_test/test_pool3d_op.py b/test/legacy_test/test_pool3d_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_pool3d_op.py
rename to test/legacy_test/test_pool3d_op.py
diff --git a/test/deprecated/legacy_test/test_prelu_op.py b/test/legacy_test/test_prelu_op.py
similarity index 88%
rename from test/deprecated/legacy_test/test_prelu_op.py
rename to test/legacy_test/test_prelu_op.py
index 75bb9b69beed4..aecc3af208225 100644
--- a/test/deprecated/legacy_test/test_prelu_op.py
+++ b/test/legacy_test/test_prelu_op.py
@@ -20,7 +20,7 @@
 import paddle
 import paddle.nn.functional as F
 from paddle import base
-from paddle.base import Program, core
+from paddle.base import core
 from paddle.pir_utils import test_with_pir_api
 
 
@@ -481,65 +481,5 @@ def test_check_grad(self):
 create_test_bf16_class(TestModeElementRank3NHWC)
 create_test_bf16_class(TestModeElementRank6NHWC)
 
-
-def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'):
-    helper = base.layer_helper.LayerHelper('prelu', **locals())
-    alpha_shape = [1, x.shape[1], 1, 1]
-    dtype = helper.input_dtype(input_param_name='x')
-    alpha = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=alpha_shape,
-        dtype='float32',
-        is_bias=False,
-        default_initializer=paddle.nn.initializer.Constant(0.25),
-    )
-    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="prelu",
-        inputs={"X": x, 'Alpha': alpha},
-        attrs={"mode": mode, 'data_format': data_format},
-        outputs={"Out": out},
-    )
-    return out
-
-
-# error message test if mode is not one of 'all', 'channel', 'element'
-class TestModeError(unittest.TestCase):
-    def setUp(self):
-        self.place = (
-            paddle.CUDAPlace(0)
-            if core.is_compiled_with_cuda()
-            else paddle.CPUPlace()
-        )
-        self.x_np = np.ones([1, 2, 3, 4]).astype('float32')
-
-    def test_mode_error(self):
-        main_program = Program()
-        with base.program_guard(main_program, Program()):
-            x = paddle.static.data(name='x', shape=[2, 3, 4, 5])
-            try:
-                y = prelu_t(x, 'any')
-            except Exception as e:
-                assert e.args[0].find('InvalidArgument') != -1
-
-    def test_data_format_error1(self):
-        main_program = Program()
-        with base.program_guard(main_program, Program()):
-            x = paddle.static.data(name='x', shape=[2, 3, 4, 5])
-            try:
-                y = prelu_t(x, 'channel', data_format='N')
-            except Exception as e:
-                assert e.args[0].find('InvalidArgument') != -1
-
-    def test_data_format_error2(self):
-        main_program = Program()
-        with base.program_guard(main_program, Program()):
-            x = paddle.static.data(name='x', shape=[2, 3, 4, 5])
-            try:
-                y = paddle.static.nn.prelu(x, 'channel', data_format='N')
-            except ValueError as e:
-                pass
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/deprecated/legacy_test/test_put_along_axis_op.py b/test/legacy_test/test_put_along_axis_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_put_along_axis_op.py
rename to test/legacy_test/test_put_along_axis_op.py
diff --git a/test/deprecated/legacy_test/test_qr_op.py b/test/legacy_test/test_qr_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_qr_op.py
rename to test/legacy_test/test_qr_op.py
diff --git a/test/deprecated/legacy_test/test_random_seed.py b/test/legacy_test/test_random_seed.py
similarity index 87%
rename from test/deprecated/legacy_test/test_random_seed.py
rename to test/legacy_test/test_random_seed.py
index ead15119a9922..8fbaf9a3d6942 100644
--- a/test/deprecated/legacy_test/test_random_seed.py
+++ b/test/legacy_test/test_random_seed.py
@@ -359,57 +359,6 @@ def test_generator_randperm_static(self):
                 np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
                 self.assertTrue(not np.allclose(out1_res2, out1_res1))
 
-    def test_gen_TruncatedNormal_initializer(self):
-        base.disable_dygraph()
-
-        gen = paddle.seed(123123143)
-        cur_state = gen.get_state()
-
-        startup_program = base.Program()
-        train_program = base.Program()
-        with base.program_guard(train_program, startup_program):
-            # example 1:
-            # attr shape is a list which doesn't contain tensor Variable.
-            x = paddle.uniform(shape=[2, 10])
-            result_1 = paddle.static.nn.fc(
-                x,
-                size=10,
-                weight_attr=paddle.nn.initializer.TruncatedNormal(
-                    mean=0.0, std=2.0
-                ),
-            )
-            result_2 = paddle.static.nn.fc(
-                x,
-                size=10,
-                weight_attr=paddle.nn.initializer.TruncatedNormal(
-                    mean=0.0, std=2.0
-                ),
-            )
-
-            exe = base.Executor(base.CPUPlace())
-            exe.run(startup_program)
-            out1 = exe.run(
-                train_program, feed={}, fetch_list=[result_1, result_2]
-            )
-
-        gen.manual_seed(123123143)
-        with base.program_guard(train_program, startup_program):
-            exe.run(startup_program)
-            out2 = exe.run(
-                train_program, feed={}, fetch_list=[result_1, result_2]
-            )
-
-        out1_res1 = np.array(out1[0])
-        out1_res2 = np.array(out1[1])
-        out2_res1 = np.array(out2[0])
-        out2_res2 = np.array(out2[1])
-
-        if not core.is_compiled_with_cuda():
-            print(">>>>>>> sampling id static >>>>>>>")
-            np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
-            np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
-            self.assertTrue(not np.allclose(out1_res2, out1_res1))
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/legacy_test/test_regularizer.py b/test/legacy_test/test_regularizer.py
new file mode 100644
index 0000000000000..a85f2fcb075da
--- /dev/null
+++ b/test/legacy_test/test_regularizer.py
@@ -0,0 +1,217 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import random
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base, regularizer
+from paddle.base import core
+from paddle.pir_utils import test_with_pir_api
+
+
+class TestL1Decay(unittest.TestCase):  # checks the ops appended for an L1Decay-regularized parameter under PIR
+    def test_l1decay_regularizer(self):
+        with paddle.pir_utils.IrGuard():
+            main_program = paddle.static.Program()
+            with paddle.static.program_guard(main_program):
+                block = main_program.global_block()
+                mul_x = paddle.pir.core.create_parameter(  # parameter carrying its own L1Decay(0.5) regularizer
+                    dtype="float32",
+                    shape=[5, 10],
+                    name="mul.x",
+                    regularizer=regularizer.L1Decay(0.5),
+                    initializer=paddle.nn.initializer.Constant(1),
+                )
+                self.assertIsNotNone(mul_x.regularizer)
+                self.assertTrue(
+                    isinstance(mul_x.regularizer, regularizer.L1Decay)
+                )
+
+                mul_y = paddle.static.data(
+                    dtype="float32", shape=[10, 8], name="mul.y"
+                )
+                mul_out = paddle.matmul(mul_x, mul_y)
+                mean_out = paddle.mean(mul_out)
+                grads = paddle.autograd.ir_backward.grad(mean_out, [mul_x])
+                params_grads = [(mul_x, grads[0])]
+                self.assertEqual(len(params_grads), 1)
+                count_ops = len(block.ops)  # op count before regularization ops are appended
+                optimizer = paddle.optimizer.Adam()
+                params_grads = optimizer.append_regularization_ops(params_grads)
+                self.assertEqual(len(params_grads), 1)
+                self.assertEqual(len(block.ops), count_ops + 5)  # exactly five ops are expected to be appended
+                self.assertEqual(block.ops[-1].name(), 'pd_op.add_n')
+                self.assertEqual(block.ops[-3].name(), 'pd_op.scale')
+                self.assertEqual(block.ops[-5].name(), 'pd_op.sign')
+
+
+class TestRegularizer(unittest.TestCase):  # regularizer tests; test_repeated_regularization is the only test method in this file's class
+    def setUp(self):
+        self.word_len = 1500
+        self.train_data = [  # two batches, each one sample: 10 distinct ids drawn from range(1000) paired with label [0]
+            [(random.sample(range(1000), 10), [0])] for _ in range(2)
+        ]
+
+    def get_places(self):  # CPU always; CUDAPlace(0) is added when Paddle is compiled with CUDA
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self, main_prog, startup_prog):  # nests unique_name, scope and program guards around the with-body
+        scope = base.core.Scope()
+        with base.unique_name.guard():
+            with base.scope_guard(scope):
+                with base.program_guard(main_prog, startup_prog):
+                    yield
+
+    def run_program(self, place, feed_list):  # runs the default main program once per batch; returns per-batch sum of |param| values
+        exe = base.Executor(place)
+        feeder = base.DataFeeder(feed_list=feed_list, place=place)
+        exe.run(base.default_startup_program())
+
+        main_prog = base.default_main_program()
+        param_list = [var.name for var in main_prog.block(0).all_parameters()]
+
+        param_sum = []
+        for data in self.train_data:
+            out = exe.run(
+                main_prog, feed=feeder.feed(data), fetch_list=param_list
+            )
+            p_sum = 0
+            for v in out:
+                p_sum += np.sum(np.abs(v))
+            param_sum.append(p_sum)
+        return param_sum
+
+    def check_l2decay_regularizer(self, place, model):  # NOTE(review): helper; no test in this file calls it
+        paddle.seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        main_prog = base.framework.Program()
+        startup_prog = base.framework.Program()
+        with self.scope_prog_guard(
+            main_prog=main_prog, startup_prog=startup_prog
+        ):
+            data = paddle.static.data(
+                name="words", shape=[-1, 1], dtype="int64", lod_level=1
+            )
+            label = paddle.static.data(
+                name="label", shape=[-1, 1], dtype="int64"
+            )
+
+            avg_cost = model(data, label, self.word_len)
+
+            optimizer = paddle.optimizer.Adagrad(  # L2 decay supplied through the optimizer's weight_decay
+                learning_rate=0.1,
+                weight_decay=paddle.regularizer.L2Decay(1.0),
+            )
+            optimizer.minimize(avg_cost)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def check_l2decay(self, place, model):  # NOTE(review): helper; no test in this file calls it
+        paddle.seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        main_prog = base.framework.Program()
+        startup_prog = base.framework.Program()
+
+        with self.scope_prog_guard(
+            main_prog=main_prog, startup_prog=startup_prog
+        ):
+            data = paddle.static.data(
+                name="words", shape=[-1, 1], dtype="int64", lod_level=1
+            )
+            label = paddle.static.data(
+                name="label", shape=[-1, 1], dtype="int64"
+            )
+
+            avg_cost_l2 = model(data, label, self.word_len)
+
+            param_list = base.default_main_program().block(0).all_parameters()
+            para_sum = []
+            for para in param_list:
+                para_mul = paddle.square(x=para)
+                para_sum.append(paddle.sum(para_mul))
+            avg_cost_l2 += paddle.add_n(para_sum) * 0.5  # explicit penalty: 0.5 * sum of squared parameters added to the loss
+
+            optimizer = paddle.optimizer.Adagrad(learning_rate=0.1)
+            optimizer.minimize(avg_cost_l2)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    @test_with_pir_api
+    def test_repeated_regularization(self):  # a regularizer set via ParamAttr should win over the optimizer's weight_decay
+        l1 = paddle.regularizer.L1Decay(coeff=0.1)  # NOTE(review): l1 is never used below
+        l2 = paddle.regularizer.L2Decay(coeff=0.01)
+        fc_param_attr = paddle.ParamAttr(
+            regularizer=paddle.regularizer.L1Decay()
+        )
+        with paddle.static.program_guard(  # static part: only builds the graph and calls minimize; nothing is executed or asserted
+            paddle.static.Program(), paddle.static.Program()
+        ):
+            x = paddle.uniform([2, 2, 3])
+            linear = paddle.nn.Linear(3, 5, weight_attr=fc_param_attr)
+            out = linear(x)
+            loss = paddle.sum(out)
+            sgd = paddle.optimizer.SGD(learning_rate=0.1, weight_decay=l2)
+            sgd.minimize(loss)
+        with base.dygraph.guard():
+            input = paddle.to_tensor(np.random.randn(3, 2).astype('float32'))
+            paddle.seed(1)
+            paddle.framework.random._manual_program_seed(1)
+
+            linear1 = paddle.nn.Linear(
+                2, 2, weight_attr=fc_param_attr, bias_attr=fc_param_attr
+            )
+            linear2 = paddle.nn.Linear(
+                2, 2, weight_attr=fc_param_attr, bias_attr=fc_param_attr
+            )
+
+            loss1 = linear1(input)
+            loss1.backward()
+            # set l2 regularizer in optimizer, but l1 in base.ParamAttr
+
+            paddle.optimizer.SGD(
+                parameters=linear1.parameters(),
+                learning_rate=1e-2,
+                weight_decay=l2,
+            ).minimize(loss1)
+            # only set l1 in base.ParamAttr
+            loss2 = linear2(input)
+            loss2.backward()
+            paddle.optimizer.SGD(
+                parameters=linear2.parameters(), learning_rate=1e-2
+            ).minimize(loss2)
+            # they should both be applied by l1, and keep the same
+            np.testing.assert_allclose(
+                linear1.weight.numpy(),
+                linear2.weight.numpy(),
+                rtol=1e-05,
+                err_msg='weight should use the regularization in base.ParamAttr!',
+            )
+            np.testing.assert_allclose(
+                linear1.bias.numpy(),
+                linear2.bias.numpy(),
+                rtol=1e-05,
+                err_msg='bias should use the regularization in base.ParamAttr!',
+            )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_regularizer_api.py b/test/legacy_test/test_regularizer_api.py
similarity index 77%
rename from test/deprecated/legacy_test/test_regularizer_api.py
rename to test/legacy_test/test_regularizer_api.py
index 32a98c5a72091..f6d3507628463 100644
--- a/test/deprecated/legacy_test/test_regularizer_api.py
+++ b/test/legacy_test/test_regularizer_api.py
@@ -15,7 +15,6 @@
 import contextlib
 import random
 import unittest
-from functools import partial
 
 import numpy as np
 
@@ -25,41 +24,6 @@
 from paddle.pir_utils import test_with_pir_api
 
 
-def bow_net(
-    data,
-    label,
-    dict_dim,
-    is_sparse=False,
-    emb_dim=8,
-    hid_dim=8,
-    hid_dim2=6,
-    class_dim=2,
-):
-    """
-    BOW net
-    This model is from https://github.com/PaddlePaddle/models:
-    base/PaddleNLP/text_classification/nets.py
-    """
-    emb = paddle.static.nn.embedding(
-        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
-    )
-    bow = paddle.static.nn.sequence_lod.sequence_pool(
-        input=emb, pool_type='sum'
-    )
-    bow_tanh = paddle.tanh(bow)
-    fc_1 = paddle.static.nn.fc(x=bow_tanh, size=hid_dim, activation="tanh")
-    fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim2, activation="tanh")
-    prediction = paddle.static.nn.fc(
-        x=[fc_2], size=class_dim, activation="softmax"
-    )
-    cost = paddle.nn.functional.cross_entropy(
-        input=prediction, label=label, reduction='none', use_softmax=False
-    )
-    avg_cost = paddle.mean(x=cost)
-
-    return avg_cost
-
-
 class TestRegularizer(unittest.TestCase):
     def setUp(self):
         self.word_len = 1500
@@ -155,27 +119,6 @@ def check_l2decay(self, place, model):
             param_sum = self.run_program(place, [data, label])
         return param_sum
 
-    def test_l2(self):
-        paddle.enable_static()
-        for place in self.get_places():
-            dense_sparse_p_sum = []
-            for sparse in [True, False]:
-                model = partial(bow_net, is_sparse=sparse)
-                framework_l2 = self.check_l2decay_regularizer(place, model)
-                l2 = self.check_l2decay(place, model)
-                assert len(l2) == len(framework_l2)
-                for i in range(len(l2)):
-                    assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5)
-                dense_sparse_p_sum.append(framework_l2)
-
-            assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
-            for i in range(len(dense_sparse_p_sum[0])):
-                assert np.isclose(
-                    a=dense_sparse_p_sum[0][i],
-                    b=dense_sparse_p_sum[1][i],
-                    rtol=5e-5,
-                )
-
     @test_with_pir_api
     def test_repeated_regularization(self):
         paddle.enable_static()
diff --git a/test/deprecated/legacy_test/test_repeat_interleave_op.py b/test/legacy_test/test_repeat_interleave_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_repeat_interleave_op.py
rename to test/legacy_test/test_repeat_interleave_op.py
diff --git a/test/deprecated/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_reshape_op.py
rename to test/legacy_test/test_reshape_op.py
diff --git a/test/deprecated/legacy_test/test_reverse_op.py b/test/legacy_test/test_reverse_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_reverse_op.py
rename to test/legacy_test/test_reverse_op.py
diff --git a/test/deprecated/legacy_test/test_roi_align_op.py b/test/legacy_test/test_roi_align_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_roi_align_op.py
rename to test/legacy_test/test_roi_align_op.py
diff --git a/test/deprecated/legacy_test/test_roi_pool_op.py b/test/legacy_test/test_roi_pool_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_roi_pool_op.py
rename to test/legacy_test/test_roi_pool_op.py
diff --git a/test/deprecated/legacy_test/test_roll_op.py b/test/legacy_test/test_roll_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_roll_op.py
rename to test/legacy_test/test_roll_op.py
diff --git a/test/deprecated/legacy_test/test_row_conv_op.py b/test/legacy_test/test_row_conv_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_row_conv_op.py
rename to test/legacy_test/test_row_conv_op.py
diff --git a/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py b/test/legacy_test/test_save_inference_model_conditional_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_save_inference_model_conditional_op.py
rename to test/legacy_test/test_save_inference_model_conditional_op.py
diff --git a/test/deprecated/legacy_test/test_save_model_without_var.py b/test/legacy_test/test_save_model_without_var.py
similarity index 100%
rename from test/deprecated/legacy_test/test_save_model_without_var.py
rename to test/legacy_test/test_save_model_without_var.py
diff --git a/test/deprecated/legacy_test/test_scatter_op.py b/test/legacy_test/test_scatter_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_scatter_op.py
rename to test/legacy_test/test_scatter_op.py
diff --git a/test/deprecated/legacy_test/test_selu_op.py b/test/legacy_test/test_selu_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_selu_op.py
rename to test/legacy_test/test_selu_op.py
diff --git a/test/deprecated/legacy_test/test_sgd_op.py b/test/legacy_test/test_sgd_op.py
similarity index 60%
rename from test/deprecated/legacy_test/test_sgd_op.py
rename to test/legacy_test/test_sgd_op.py
index 20f67faf44f3f..d1e81b11c67e8 100644
--- a/test/deprecated/legacy_test/test_sgd_op.py
+++ b/test/legacy_test/test_sgd_op.py
@@ -20,7 +20,6 @@
 from utils import dygraph_guard
 
 import paddle
-from paddle import base
 from paddle.base import core
 
 paddle.enable_static()
@@ -204,32 +203,6 @@ def test_sparse_parameter_sgd(self):
             self.check_with_place(place)
 
 
-class TestSGDOpWithLargeInput(unittest.TestCase):
-    def runTest(self):
-        paddle.enable_static()
-        data = paddle.tensor.fill_constant(shape=[1], value=128, dtype='int64')
-        label = paddle.tensor.fill_constant(
-            shape=[1, 150], value=0.5, dtype='float32'
-        )
-        emb = paddle.static.nn.embedding(
-            input=data, size=(10000000, 150), dtype='float32'
-        )
-        out = paddle.nn.functional.normalize(x=emb, axis=-1)
-
-        cost = paddle.nn.functional.square_error_cost(input=out, label=label)
-        avg_cost = paddle.mean(cost)
-        sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
-        sgd_optimizer.minimize(avg_cost)
-
-        place = base.CPUPlace()
-        exe = base.Executor(place)
-        exe.run(base.default_startup_program())
-        compiled_prog = base.compiler.CompiledProgram(
-            base.default_main_program()
-        )
-        result = exe.run(compiled_prog, fetch_list=[avg_cost])
-
-
 class TestSGDV2(unittest.TestCase):
     def test_sgd_dygraph(self):
         paddle.disable_static()
@@ -247,50 +220,6 @@ def test_sgd_dygraph(self):
         adam.step()
         adam.clear_gradients()
 
-    def test_sgd(self):
-        paddle.enable_static()
-
-        def check_sgd_optimizer(optimizer_attr):
-            init_program = paddle.static.Program()
-            program = paddle.static.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                optimize_attr=optimizer_attr,
-            )
-            mul_y = block.create_var(
-                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y"
-            )
-            mul_out = block.create_var(
-                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out"
-            )
-            mean_out = block.create_var(
-                dtype="float32", shape=[1], lod_level=0, name="mean.out"
-            )
-            block.append_op(
-                type="mul",
-                inputs={"X": mul_x, "Y": mul_y},
-                outputs={"Out": mul_out},
-                attrs={"x_num_col_dims": 1},
-            )
-            block.append_op(
-                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}
-            )
-            sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-            opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-            return opts
-
-        opts = check_sgd_optimizer({'learning_rate': 1.1})
-        self.assertEqual(len(opts), 2)
-        self.assertEqual([op.type for op in opts], ["scale", "sgd"])
-
-        opts = check_sgd_optimizer({'learning_rate': 1.0})
-        self.assertEqual(len(opts), 1)
-        self.assertEqual([op.type for op in opts], ["sgd"])
-
     def test_raise_error(self):
         self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None)
 
@@ -320,114 +249,6 @@ def test_sgd_group_dygraph(self):
         adam.clear_gradients()
 
 
-class TestSGDMultiPrecision2_0(unittest.TestCase):
-    def dygraph_sgd_mp(self, mp):
-        paddle.disable_static()
-        paddle.seed(10)
-        paddle.set_device('gpu')
-        input = paddle.randn((2, 2))
-        model = paddle.nn.Linear(2, 2)
-        optimizer = paddle.optimizer.SGD(
-            parameters=model.parameters(), multi_precision=mp
-        )
-        if mp:
-            model = paddle.amp.decorate(models=model, level='O2')
-            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-
-        for idx in range(5):
-            if mp:
-                with paddle.amp.auto_cast(level='O2'):
-                    output = model(input)
-                    loss = paddle.mean(output)
-                scaled = scaler.scale(loss)
-                scaled.backward()
-                scaler.minimize(optimizer, scaled)
-                optimizer.clear_grad()
-            else:
-                output = model(input)
-                loss = paddle.mean(output)
-                optimizer.step()
-                optimizer.clear_grad()
-
-        return output, model.parameters()
-
-    def static_sgd_mp(self, mp):
-        paddle.enable_static()
-        paddle.seed(10)
-        np.random.seed(10)
-        exe = paddle.static.Executor('gpu')
-        train_program = paddle.static.Program()
-        startup_program = paddle.static.Program()
-        optimizer = paddle.optimizer.SGD(multi_precision=mp)
-
-        if mp:
-            optimizer = paddle.static.amp.decorate(
-                optimizer,
-                init_loss_scaling=128.0,
-                use_dynamic_loss_scaling=True,
-                use_pure_fp16=True,
-                use_fp16_guard=False,
-            )
-        with paddle.static.program_guard(train_program, startup_program):
-            if mp:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float16'
-                )
-            else:
-                data = paddle.static.data(
-                    shape=[2, 2], name='X', dtype='float32'
-                )
-            hidden = paddle.static.nn.fc(x=data, size=10)
-            loss = paddle.mean(hidden)
-            optimizer.minimize(loss)
-        exe.run(startup_program)
-
-        if mp:
-            optimizer.amp_init(
-                place=paddle.CUDAPlace(0), scope=paddle.static.global_scope()
-            )
-            x = np.random.random(size=(2, 2)).astype('float16')
-        else:
-            x = np.random.random(size=(2, 2)).astype('float32')
-        out = []
-        for idx in range(5):
-            (loss_data,) = exe.run(
-                train_program, feed={"X": x}, fetch_list=[loss]
-            )
-            out.append(loss_data)
-        return out
-
-    def test_main(self):
-        if not paddle.is_compiled_with_cuda():
-            return
-        "Test dygraph mode"
-        output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True)
-        output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False)
-        np.testing.assert_allclose(
-            output1_dy.astype('float32').numpy(),
-            output2_dy.astype('float32').numpy(),
-            rtol=1e-05,
-            atol=0.1,
-        )
-        for idx in range(len(params1_dy)):
-            np.testing.assert_allclose(
-                params1_dy[idx].astype('float32').numpy(),
-                params2_dy[idx].astype('float32').numpy(),
-                rtol=1e-05,
-                atol=0.1,
-            )
-        "Test static graph mode"
-        output1_st = self.static_sgd_mp(mp=True)
-        output2_st = self.static_sgd_mp(mp=False)
-        for idx in range(len(output1_st)):
-            np.testing.assert_allclose(
-                output1_st[idx].astype('float32'),
-                output2_st[idx].astype('float32'),
-                rtol=1e-05,
-                atol=0.1,
-            )
-
-
 class TestSGDSimple(unittest.TestCase):
     def setUp(self) -> None:
         self.data = np.random.random(size=(2, 2)).astype('float32')
diff --git a/test/deprecated/legacy_test/test_shuffle_channel_op.py b/test/legacy_test/test_shuffle_channel_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_shuffle_channel_op.py
rename to test/legacy_test/test_shuffle_channel_op.py
diff --git a/test/deprecated/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py b/test/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py
similarity index 100%
rename from test/deprecated/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py
rename to test/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py
diff --git a/test/deprecated/legacy_test/test_sign_op.py b/test/legacy_test/test_sign_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_sign_op.py
rename to test/legacy_test/test_sign_op.py
diff --git a/test/deprecated/legacy_test/test_solve_op.py b/test/legacy_test/test_solve_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_solve_op.py
rename to test/legacy_test/test_solve_op.py
diff --git a/test/legacy_test/test_sparse_mask_as_op.py b/test/legacy_test/test_sparse_mask_as_op.py
new file mode 100644
index 0000000000000..f4cd639452b5d
--- /dev/null
+++ b/test/legacy_test/test_sparse_mask_as_op.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+def generate_data(shape, dtype):
+    """Generate same-shape `data` and `mask`; `mask` is never all-zero."""
+    _mask = np.random.randint(0, 2, shape)
+    if np.sum(_mask) == 0:
+        _mask.flat[0] = 1
+    # magnitudes are drawn from [1, 100) so `_mask` alone fixes the pattern
+    signs = np.random.choice([-1, 1], shape)
+    mask = (np.random.randint(1, 100, shape) * signs * _mask).astype(dtype)
+    data = np.random.randint(-100, 100, shape).astype(dtype)
+    return data, mask
+
+
+class TestMaskAs(unittest.TestCase):
+    def setUp(self):
+        self.init_format()
+        self.places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def init_format(self):
+        self.format = None
+
+    def check(self, shape, dtype, place, check_grad=True):
+        paddle.disable_static()
+        dense_data_np, dense_mask_np = generate_data(shape, dtype)
+
+        dense_data_pd = paddle.to_tensor(
+            dense_data_np, dtype=dtype, place=place
+        )
+        dense_data_pd.stop_gradient = False
+
+        if self.format == 'coo':
+            sparse_mask_pd = paddle.to_tensor(
+                dense_mask_np, dtype=dtype, place=place
+            ).to_sparse_coo(len(shape))
+        else:
+            sparse_mask_pd = paddle.to_tensor(
+                dense_mask_np, dtype=dtype, place=place
+            ).to_sparse_csr()
+
+        sparse_out_pd = paddle.sparse.mask_as(dense_data_pd, sparse_mask_pd)
+
+        # compare the tensor from sparse->dense with reference numpy data
+        # the result only keeps the values where mask not zero, like:
+        # dense_data_np
+        # [[ 38.  15.  76.]
+        #  [-98. -75.  10.]
+        #  [-52.  49. -48.]]
+        # dense_mask_np
+        # [[-70.   0.   0.]
+        #  [-50.  34.  60.]
+        #  [-34.   0. -18.]]
+        # dense_data_np_ref
+        # [[ 38.   0.   0.]
+        #  [-98. -75.  10.]
+        #  [-52.   0. -48.]]
+        dense_data_np_ref = dense_data_np * (dense_mask_np != 0)
+        np.testing.assert_allclose(
+            sparse_out_pd.to_dense().numpy(), dense_data_np_ref
+        )
+
+        if check_grad:
+            # with sparse_out_pd backward, we get the grad from dense_data_pd
+            sparse_out_pd.backward()
+            dense_data_grad = dense_data_pd.grad
+
+            self.assertEqual(
+                list(dense_data_grad.shape), list(dense_data_pd.shape)
+            )
+            self.assertEqual(dense_data_grad.dtype, dense_data_pd.dtype)
+
+            # make a dense data to compare the grad from sparse_out_pd
+            grad_ref = np.ones_like(dense_mask_np) * (dense_mask_np != 0)
+
+            np.testing.assert_allclose(
+                dense_data_pd.grad.numpy(),
+                grad_ref,
+            )
+
+    def check_with_dtypes(self, shape):
+        for place in self.places:
+            self.check(shape, 'float32', place)
+            self.check(shape, 'float64', place)
+            self.check(shape, 'int32', place)
+            self.check(shape, 'int64', place)
+            self.check(shape, 'complex64', place)
+            self.check(shape, 'complex128', place)
+
+            # `int8` not registered in `FullLikeCooKernel`, so skip check_grad
+            self.check(shape, 'int8', place, check_grad=False)
+
+            # `int16` not registered in `multiply`, so skip check_grad
+            self.check(shape, 'int16', place, check_grad=False)
+
+        if paddle.is_compiled_with_cuda():
+            place = paddle.CUDAPlace(0)
+            self.check(shape, 'float16', place)
+
+
+class TestMaskAsCoo(TestMaskAs):
+    def init_format(self):
+        self.format = 'coo'
+
+    def test_1d(self):
+        self.check_with_dtypes((5,))
+
+    def test_2d(self):
+        self.check_with_dtypes((5, 3))
+
+    def test_3d(self):
+        self.check_with_dtypes((5, 3, 4))
+
+    def test_4d(self):
+        self.check_with_dtypes((5, 3, 4, 2))
+
+
+class TestMaskAsCsr(TestMaskAs):
+    def init_format(self):
+        self.format = 'csr'
+
+    def test_2d(self):
+        self.check_with_dtypes((5, 3))
+
+    def test_3d(self):
+        self.check_with_dtypes((5, 3, 4))
+
+    def test_error_dimension(self):
+        # error 1d
+        with self.assertRaises(ValueError):
+            self.check_with_dtypes((5,))
+
+        # error 4d
+        with self.assertRaises(ValueError):
+            self.check_with_dtypes((5, 3, 4, 2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/deprecated/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_spectral_norm_op.py
rename to test/legacy_test/test_spectral_norm_op.py
diff --git a/test/deprecated/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_split_op.py
rename to test/legacy_test/test_split_op.py
diff --git a/test/deprecated/legacy_test/test_static_save_load_large.py b/test/legacy_test/test_static_save_load_large.py
similarity index 100%
rename from test/deprecated/legacy_test/test_static_save_load_large.py
rename to test/legacy_test/test_static_save_load_large.py
diff --git a/test/deprecated/legacy_test/test_stft_op.py b/test/legacy_test/test_stft_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_stft_op.py
rename to test/legacy_test/test_stft_op.py
diff --git a/test/deprecated/legacy_test/test_svd_op.py b/test/legacy_test/test_svd_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_svd_op.py
rename to test/legacy_test/test_svd_op.py
diff --git a/test/deprecated/legacy_test/test_swiglu.py b/test/legacy_test/test_swiglu.py
similarity index 100%
rename from test/deprecated/legacy_test/test_swiglu.py
rename to test/legacy_test/test_swiglu.py
diff --git a/test/deprecated/legacy_test/test_temporal_shift_op.py b/test/legacy_test/test_temporal_shift_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_temporal_shift_op.py
rename to test/legacy_test/test_temporal_shift_op.py
diff --git a/test/deprecated/legacy_test/test_top_k_op.py b/test/legacy_test/test_top_k_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_top_k_op.py
rename to test/legacy_test/test_top_k_op.py
diff --git a/test/deprecated/legacy_test/test_top_k_v2_op.py b/test/legacy_test/test_top_k_v2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_top_k_v2_op.py
rename to test/legacy_test/test_top_k_v2_op.py
diff --git a/test/deprecated/legacy_test/test_trace_op.py b/test/legacy_test/test_trace_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_trace_op.py
rename to test/legacy_test/test_trace_op.py
diff --git a/test/deprecated/legacy_test/test_triangular_solve_op.py b/test/legacy_test/test_triangular_solve_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_triangular_solve_op.py
rename to test/legacy_test/test_triangular_solve_op.py
diff --git a/test/deprecated/legacy_test/test_trilinear_interp_op.py b/test/legacy_test/test_trilinear_interp_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_trilinear_interp_op.py
rename to test/legacy_test/test_trilinear_interp_op.py
diff --git a/test/deprecated/legacy_test/test_trilinear_interp_v2_op.py b/test/legacy_test/test_trilinear_interp_v2_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_trilinear_interp_v2_op.py
rename to test/legacy_test/test_trilinear_interp_v2_op.py
diff --git a/test/deprecated/legacy_test/test_trunc_op.py b/test/legacy_test/test_trunc_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_trunc_op.py
rename to test/legacy_test/test_trunc_op.py
diff --git a/test/deprecated/legacy_test/test_unfold_op.py b/test/legacy_test/test_unfold_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_unfold_op.py
rename to test/legacy_test/test_unfold_op.py
diff --git a/test/deprecated/legacy_test/test_unique_consecutive_op.py b/test/legacy_test/test_unique_consecutive_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_unique_consecutive_op.py
rename to test/legacy_test/test_unique_consecutive_op.py
diff --git a/test/deprecated/legacy_test/test_unpool3d_op.py b/test/legacy_test/test_unpool3d_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_unpool3d_op.py
rename to test/legacy_test/test_unpool3d_op.py
diff --git a/test/deprecated/legacy_test/test_unpool_op.py b/test/legacy_test/test_unpool_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_unpool_op.py
rename to test/legacy_test/test_unpool_op.py
diff --git a/test/deprecated/legacy_test/test_unstack_op.py b/test/legacy_test/test_unstack_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_unstack_op.py
rename to test/legacy_test/test_unstack_op.py
diff --git a/test/deprecated/legacy_test/test_yolov3_loss_op.py b/test/legacy_test/test_yolov3_loss_op.py
similarity index 100%
rename from test/deprecated/legacy_test/test_yolov3_loss_op.py
rename to test/legacy_test/test_yolov3_loss_op.py
diff --git a/test/mkldnn/test_batch_norm_mkldnn_op.py b/test/mkldnn/test_batch_norm_mkldnn_op.py
index 99f48c65b0a4e..490021ecee6d9 100644
--- a/test/mkldnn/test_batch_norm_mkldnn_op.py
+++ b/test/mkldnn/test_batch_norm_mkldnn_op.py
@@ -20,8 +20,8 @@
 from op_test import _set_use_system_allocator, pir_executor_guard
 
 sys.path.append("../deprecated/legacy_test")
-from test_batch_norm_op import (
-    TestBatchNormOpInference,
+from test_batch_norm_op import TestBatchNormOpInference
+from test_batch_norm_op_deprecated import (
     TestBatchNormOpTraining,
     _reference_grad,
     _reference_training,
diff --git a/test/prim/pir_prim/CMakeLists.txt b/test/prim/pir_prim/CMakeLists.txt
index 108cc3b8b28da..c2bb0610a60d6 100644
--- a/test/prim/pir_prim/CMakeLists.txt
+++ b/test/prim/pir_prim/CMakeLists.txt
@@ -12,7 +12,9 @@ set(TEST_PRIM_PURE_PIR_CASES
     test_auto_recompute
     test_auto_recompute_dy2static
     test_prim_sub_graph_dynamic_shape
-    test_decompose_control_flow)
+    test_prim_sub_graph_backward_dynamic_shape
+    test_decompose_control_flow
+    test_decomp_whole_program)
 
 foreach(target ${TEST_PRIM_PURE_PIR_CASES})
   py_test_modules(
@@ -52,6 +54,7 @@ if(WITH_CINN)
       FLAGS_prim_check_ops=true
       FLAGS_enable_pir_api=true
       FLAGS_prim_enable_dynamic=true
+      FLAGS_prim_vjp_skip_default_ops=false
       FLAGS_cinn_bucket_compile=True
       FLAGS_pir_apply_shape_optimization_pass=1)
     set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=CINN")
diff --git a/test/prim/pir_prim/test_decomp_whole_program.py b/test/prim/pir_prim/test_decomp_whole_program.py
index f8c58ef7c2469..7d0b28edf5dad 100644
--- a/test/prim/pir_prim/test_decomp_whole_program.py
+++ b/test/prim/pir_prim/test_decomp_whole_program.py
@@ -40,7 +40,8 @@ def base_net(self, flag=None):
             y.stop_gradient = False
             x1 = paddle.sin(x)
             y1 = paddle.cos(y)
-            tmp1 = paddle.matmul(x1, y1)
+            y3 = paddle.matmul(x1, y1)
+            tmp1 = paddle.concat((x1, y1, y3))
             tmp2 = paddle.mean(tmp1)
             sum_out = paddle.sin(tmp2)
             gradients = grad(sum_out, (x, y))
@@ -54,17 +55,18 @@ def base_net(self, flag=None):
 
         whole_ops = [op.name() for op in main_program.global_block().ops]
         if flag == "prim":
-            assert 'pd_op.matmul_grad' not in whole_ops
+            assert 'pd_op.concat_grad' not in whole_ops
         else:
-            assert 'pd_op.matmul_grad' in whole_ops
+            assert 'pd_op.concat_grad' in whole_ops
 
         return fwd, dx, dy
 
     def test_prim_all(self):
+        paddle.base.core._set_prim_backward_blacklist("sin_grad", "cos_grad")
         res_ref = self.base_net()
         res = self.base_net("prim")
         for ref, actual in zip(res_ref, res):
-            np.testing.assert_allclose(ref, actual, rtol=1e-6)
+            np.testing.assert_allclose(ref, actual, rtol=1e-6, atol=1e-6)
 
 
 if __name__ == "__main__":
diff --git a/test/prim/pir_prim/test_prim_sub_graph_backward_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_backward_dynamic_shape.py
new file mode 100644
index 0000000000000..4567139ea3c34
--- /dev/null
+++ b/test/prim/pir_prim/test_prim_sub_graph_backward_dynamic_shape.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.framework import core
+from paddle.static import InputSpec
+
+
+def sum_net1(x):
+    return paddle.sum(x, axis=1, keepdim=False)
+
+
+def sum_net2(x):
+    return paddle.sum(x)
+
+
+def sum_net3(x):
+    return paddle.sum(x, keepdim=True)
+
+
+def sum_net4(x):
+    return paddle.sum(x, axis=-1, keepdim=False)
+
+
+def sum_net5(x):
+    return paddle.sum(x, axis=[0, 2], keepdim=False)
+
+
+def mean_net1(x):
+    return paddle.mean(x, axis=1, keepdim=False)
+
+
+def mean_net2(x):
+    return paddle.mean(x, axis=-1, keepdim=False)
+
+
+def mean_net3(x):
+    return paddle.mean(x, axis=[0, 2], keepdim=False)
+
+
+def apply_to_static(net, use_cinn, input_spec=None):
+    build_strategy = paddle.static.BuildStrategy()
+    build_strategy.build_cinn_pass = use_cinn
+    return paddle.jit.to_static(
+        net,
+        input_spec=input_spec,
+        build_strategy=build_strategy,
+        full_graph=True,
+    )
+
+
+class TestPrimBaseWithGrad(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = sum_net1
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+    def base_net(self, flag=None):
+        """Run `self.net` forward/backward; return (output, x gradient)."""
+        if flag == "prim":
+            core._set_prim_all_enabled(True)
+        try:
+            fn = self.net
+            if flag == "prim":
+                spec = InputSpec(shape=self.init_x_shape, dtype=self.dtype)
+                fn = apply_to_static(
+                    self.net, use_cinn=self.enable_cinn, input_spec=[spec]
+                )
+                fn.train()
+            x = paddle.to_tensor(self.x, stop_gradient=False)
+            res = fn(x)
+            res.backward()
+            x_grad = x.gradient()
+        finally:
+            # restore the global prim switch even if the run above raised
+            if flag == "prim":
+                core._set_prim_all_enabled(False)
+        return res, x_grad
+
+    def test_prim_all_dynamic(self):
+        res_ref, grad_ref = self.base_net()
+        res, grad = self.base_net("prim")
+
+        for ref, actual in zip(res_ref, res):
+            np.testing.assert_allclose(
+                ref, actual, rtol=self.tol, atol=self.tol
+            )
+
+        for dr, d in zip(grad_ref, grad):
+            np.testing.assert_allclose(dr, d, rtol=self.tol, atol=self.tol)
+
+
+class TestPrimSumWithGrad1(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [1000]
+        self.init_x_shape = [None]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = sum_net2
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimSumWithGrad2(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = sum_net3
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimSumWithGrad3(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = sum_net2
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimSumWithGrad4(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = sum_net4
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimSumWithGrad5(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = sum_net5
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimMeanWithGrad(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = mean_net1
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimMeanWithGrad2(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = mean_net2
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+class TestPrimMeanWithGrad3(TestPrimBaseWithGrad):
+    def setUp(self):
+        np.random.seed(2023)
+        self.dtype = "float32"
+        self.x_shape = [30, 200, 40]
+        self.init_x_shape = [None, None, 40]
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.net = mean_net3
+        self.enable_cinn = False
+        self.tol = 1e-6
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/ps/download_criteo_data.sh b/test/ps/download_criteo_data.sh
index 69bfd90bee050..911ba59f34d5e 100755
--- a/test/ps/download_criteo_data.sh
+++ b/test/ps/download_criteo_data.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/ps/download_data.sh b/test/ps/download_data.sh
index 498d9df9c2b4a..8feb69bcb9407 100755
--- a/test/ps/download_data.sh
+++ b/test/ps/download_data.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/ps/gpubox_run.sh b/test/ps/gpubox_run.sh
index a38a4498ee4c0..27316171ec667 100644
--- a/test/ps/gpubox_run.sh
+++ b/test/ps/gpubox_run.sh
@@ -1,13 +1,13 @@
 # !/bin/bash
 
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/test/quantization/CMakeLists.txt b/test/quantization/CMakeLists.txt
index e18f8c0a38096..5b37f83e0c28f 100644
--- a/test/quantization/CMakeLists.txt
+++ b/test/quantization/CMakeLists.txt
@@ -461,16 +461,9 @@ list(REMOVE_ITEM TEST_OPS test_filter_pruning)
 # fix
 if(WIN32)
   set(SINGLE_CARD_TEST_OPS
-      test_user_defined_quantization
-      test_quantization_scale_pass
-      test_quantization_pass
-      test_moving_average_abs_max_scale_op
-      test_imperative_qat_channelwise
-      test_imperative_qat
-      test_imperative_qat_lsq
-      test_imperative_qat_matmul
-      test_imperative_out_scale
-      test_graph)
+      test_imperative_qat_channelwise test_imperative_qat
+      test_imperative_qat_lsq test_imperative_qat_matmul
+      test_imperative_out_scale)
   list(REMOVE_ITEM TEST_OPS ${SINGLE_CARD_TEST_OPS})
   foreach(src ${SINGLE_CARD_TEST_OPS})
     py_test(${src} SRCS ${src}.py ENVS CUDA_VISIBLE_DEVICES=0)
diff --git a/test/quantization/test_imperative_qat_lsq.py b/test/quantization/test_imperative_qat_lsq.py
index c71bd02c56bbc..bd16d309b249c 100644
--- a/test/quantization/test_imperative_qat_lsq.py
+++ b/test/quantization/test_imperative_qat_lsq.py
@@ -213,7 +213,7 @@ def func_qat(self):
             print('eval_acc_top1', eval_acc_top1)
         self.assertTrue(
             eval_acc_top1 > 0.9,
-            msg="The test acc {%f} is less than 0.9." % eval_acc_top1,
+            msg=f"The test acc {{{eval_acc_top1:f}}} is less than 0.9.",
         )
 
     def test_qat(self):
diff --git a/test/sot/test_builtin_map.py b/test/sot/test_builtin_map.py
index f005ec10cdbe4..bad6206f3b3bc 100644
--- a/test/sot/test_builtin_map.py
+++ b/test/sot/test_builtin_map.py
@@ -24,11 +24,11 @@
 from paddle.jit.sot.utils import strict_mode_guard
 
 
-def double_num(num: float | int):
+def double_num(num: float):
     return num * 2
 
 
-def double_num_with_breakgraph(num: float | int):
+def double_num_with_breakgraph(num: float):
     sot.psdb.breakgraph()
     return num * 2
 
diff --git a/test/sot/test_sot_dynamic_shape.py b/test/sot/test_sot_dynamic_shape.py
index ceed37d64438a..12608d1c871e4 100644
--- a/test/sot/test_sot_dynamic_shape.py
+++ b/test/sot/test_sot_dynamic_shape.py
@@ -25,7 +25,7 @@
 from paddle.jit.sot.utils import with_allow_dynamic_shape_guard
 
 
-def foo(x):
+def dynamic_shape_input_func1(x):
     s = x.shape[0]
     return x + s
 
@@ -85,6 +85,20 @@ def test_dynamic_int_input_cache_hit_case3(self):
                 )
                 self.assertEqual(ctx.translate_count, i + 1)
 
+    def test_dynamic_shape_input_cache_hit_case1(self):
+        with with_allow_dynamic_shape_guard(
+            True
+        ), test_instruction_translator_cache_context() as ctx:
+            self.assert_results(
+                dynamic_shape_input_func1, paddle.randn([1, 4, 5])
+            )
+            self.assertEqual(ctx.translate_count, 1)
+            for i in range(2, 6):
+                self.assert_results(
+                    dynamic_shape_input_func1, paddle.randn([i, 4, 5])
+                )
+                self.assertEqual(ctx.translate_count, 2)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/standalone_executor/test_standalone_measure_real_op_cost.py b/test/standalone_executor/test_standalone_measure_real_op_cost.py
index 9825e16e91ee6..8ee254a427d8e 100644
--- a/test/standalone_executor/test_standalone_measure_real_op_cost.py
+++ b/test/standalone_executor/test_standalone_measure_real_op_cost.py
@@ -112,7 +112,7 @@ def _run_op_profiling(self, place, run_profiling=True):
         return loss_data
 
     def _compare_loss_between(self, loss_run1, loss_run2):
-        s1, s2 = '%.6f' % loss_run1, '%.6f' % loss_run2
+        s1, s2 = f'{loss_run1:.6f}', f'{loss_run2:.6f}'
         return s1 == s2
 
     def test_op_profiling_cuda0(self):
diff --git a/test/white_list/op_threshold_white_list.py b/test/white_list/op_threshold_white_list.py
index 518980dec7de7..9809105815577 100644
--- a/test/white_list/op_threshold_white_list.py
+++ b/test/white_list/op_threshold_white_list.py
@@ -33,6 +33,7 @@
     'fractional_max_pool3d',
     'norm',
     'pool3d',
+    'lp_pool2d',
     'reduce_prod',
     'selu',
     'sigmoid_cross_entropy_with_logits',
diff --git a/test/white_list/pir_op_test_white_list b/test/white_list/pir_op_test_white_list
index 2daa7ddd497e4..99eb57ea4a17b 100644
--- a/test/white_list/pir_op_test_white_list
+++ b/test/white_list/pir_op_test_white_list
@@ -142,7 +142,7 @@ test_i0_op
 test_i0e_op
 test_i1_op
 test_i1e_op
-test_imperative_lod_tensor_to_selected_rows
+test_imperative_lod_tensor_to_selected_rows_deprecated
 test_index_add_op
 test_index_sample_op
 test_index_select_op
diff --git a/test/xpu/test_block_multihead_attention_op_xpu.py b/test/xpu/test_block_multihead_attention_op_xpu.py
new file mode 100644
index 0000000000000..6a898bbf8f26e
--- /dev/null
+++ b/test/xpu/test_block_multihead_attention_op_xpu.py
@@ -0,0 +1,581 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.incubate.nn.functional import block_multihead_attention_xpu
+
+paddle.seed(2023)
+np.random.seed(2023)
+
+
+def create_attn_mask(
+    mask_type,
+    batch_size,
+    seq_lens,
+    pre_cache_length=0,
+):
+    """Build a [batch, 1, S, S + pre_cache_length] mask, S = max(seq_lens)."""
+    longest = max(seq_lens)
+    mask_shape = [batch_size, 1, longest, longest + pre_cache_length]
+    mask = paddle.zeros(mask_shape, dtype=mask_type)
+    mask[:, :, :, :pre_cache_length] = 1
+    for row in range(batch_size):
+        cur_len = seq_lens[row]
+        # tril - 1 is 0 on/below the diagonal and -1 above it.
+        causal = paddle.tril(
+            paddle.ones(shape=(cur_len, cur_len), dtype=mask_type)
+        )
+        mask[row, 0, :cur_len, :cur_len] = (causal - 1) * 1e4
+    return mask
+
+
+def naive_attention_impl(
+    query,
+    key,
+    value,
+    cache_k=None,
+    cache_v=None,
+    pre_cache_k=None,
+    pre_cache_v=None,
+    mask=None,
+    scale=1.0,
+    cache_k_dequant_scales=None,
+    cache_v_dequant_scales=None,
+    use_cachekv_int8="None",
+):
+    """Reference softmax(Q @ K^T * scale + mask) @ V with optional KV caches.
+
+    ``key``/``value`` heads are tiled up to the query head count, then the
+    pre-cache and cache tensors (if given) are prepended along axis 2. A cache
+    with dequant scales is first restored as ``(cache - 128) * scales``.
+
+    Raises:
+        ValueError: a cache has dequant scales but ``use_cachekv_int8`` is
+            neither "dynamic" nor "static".
+    """
+    batch, heads, seq_len, head_dim = query.shape[:4]
+    kv_head = key.shape[1]
+    # Axes that broadcast the dequant scales over the cache tensor.
+    unsqueeze_shape = {"dynamic": [2, 3], "static": [0, 2, 3]}.get(
+        use_cachekv_int8
+    )
+
+    def _with_caches(cur, pre_cache, cache, dequant_scales):
+        # Repeat each kv head heads // kv_head times to match the query.
+        cur = cur.reshape([batch, kv_head, 1, seq_len, head_dim])
+        cur = paddle.tile(cur, [1, 1, heads // kv_head, 1, 1])
+        cur = cur.reshape([batch, heads, seq_len, head_dim])
+        if pre_cache is not None:
+            cur = paddle.concat([pre_cache, cur], axis=2)
+        if cache is not None and dequant_scales is not None:
+            if unsqueeze_shape is None:
+                raise ValueError(
+                    f"unknown use_cachekv_int8: {use_cachekv_int8!r}"
+                )
+            cache = (
+                (cache.astype('float32') - 128.0)
+                * dequant_scales.unsqueeze(unsqueeze_shape)
+            ).astype(cur.dtype)
+        if cache is not None:
+            cur = paddle.concat([cache, cur], axis=2)
+        return cur
+
+    key = _with_caches(key, pre_cache_k, cache_k, cache_k_dequant_scales)
+    value = _with_caches(value, pre_cache_v, cache_v, cache_v_dequant_scales)
+
+    # Scaled scores, optionally biased by the additive mask, then softmax.
+    attention = paddle.matmul(query, key, transpose_y=True) * scale
+    if mask is not None:
+        attention = attention + mask
+    softmax_result = paddle.nn.functional.softmax(attention, -1)
+    result = paddle.matmul(softmax_result, value)
+    return result
+
+
+def get_padding_offset(bsz, max_seq_len, seq_lens_this_time):
+    """Derive per-token padding offsets and cumulative sequence lengths."""
+    pad_prefix = paddle.cumsum(max_seq_len - seq_lens_this_time)
+    cum_offsets = paddle.zeros(shape=(bsz + 1), dtype="int32")
+    cum_offsets[1:] = pad_prefix
+    token_num = paddle.sum(seq_lens_this_time)
+    padding_offsets = paddle.zeros(shape=(token_num), dtype="int32")
+    cu_seqlens_q = paddle.zeros(shape=(bsz + 1), dtype="int32")
+    cu_seqlens_k = paddle.zeros(shape=(bsz + 1), dtype="int32")
+    for b in range(bsz):
+        cur_len = seq_lens_this_time[b]
+        skipped = cum_offsets[b]  # padding slots before sequence b
+        for t in range(cur_len):
+            padding_offsets[b * max_seq_len - skipped + t] = skipped
+        packed_end = (b + 1) * max_seq_len - cum_offsets[b + 1]
+        cu_seqlens_q[b + 1] = cu_seqlens_k[b + 1] = packed_end
+    return padding_offsets, cum_offsets[:-1], cu_seqlens_q, cu_seqlens_k
+
+
+class RopeEmbedding:
+    def _rotary_position_embedding(self, seq_len, head_dim, dtype):
+        pos_seq = paddle.arange(0, seq_len, 1, dtype=dtype)
+        indices = paddle.arange(0, head_dim, 2, dtype=dtype)
+        indices = 1 / 10000 ** (indices / head_dim)
+
+        sinusoid_inp = pos_seq.unsqueeze(1) * indices.unsqueeze(0)
+        pos_emb = paddle.concat(
+            [paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1
+        )
+        pos_emb = paddle.reshape(pos_emb, (1, 1, seq_len, head_dim))
+        pos_emb.stop_gradient = True
+        return pos_emb
+
+    def _apply_rope(self, rp, q, k, v=None):
+        # sin [sequence_length, embed_size_per_head//2]
+        # cos [sequence_length, embed_size_per_head//2]
+        sin, cos = paddle.chunk(rp, 2, axis=-1)
+        # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
+        sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), rp.shape)
+        # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
+        cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), rp.shape)
+        # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2]
+        rotate_half_q = paddle.reshape(
+            paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1),
+            paddle.shape(q),
+        )
+        query = paddle.add(
+            paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos)
+        )
+        # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2]
+        rotate_half_k = paddle.reshape(
+            paddle.stack([-k[:, :, :, 1::2], k[:, :, :, 0::2]], axis=-1),
+            paddle.shape(k),
+        )
+        key = paddle.add(
+            paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos)
+        )
+        if v is not None:
+            # rotate_half_value_layer [-v1,v0,-v3,v2......,-vd-1,vd-2]
+            rotate_half_v = paddle.reshape(
+                paddle.stack([-v[:, :, :, 1::2], v[:, :, :, 0::2]], axis=-1),
+                paddle.shape(v),
+            )
+            value = paddle.add(
+                paddle.multiply(v, cos_pos),
+                paddle.multiply(rotate_half_v, sin_pos),
+            )
+            return query, key, value
+        return query, key
+
+    def _apply_neox_rope(self, rp, q, k, v=None):
+        # sin [bs, sequence_length, embed_size_per_head//2]
+        # cos [bs, sequence_length, embed_size_per_head//2]
+        sin, cos = paddle.chunk(rp, 2, axis=-1)
+
+        # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ1,θ2......θd/2-1, θ0,θ1,θ2......θd/2-1]
+        sin_pos = paddle.concat([sin, sin], axis=-1).squeeze(0).unsqueeze(1)
+        # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ1,θ2......θd/2-1, θ0,θ1,θ2......θd/2-1]
+        cos_pos = paddle.concat([cos, cos], axis=-1).squeeze(0).unsqueeze(1)
+        rotate_half_q = paddle.reshape(
+            paddle.concat(
+                [-q[:, :, :, sin.shape[-1] :], q[:, :, :, 0 : sin.shape[-1]]],
+                axis=-1,
+            ),
+            paddle.shape(q),
+        )
+        query = paddle.add(
+            paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos)
+        )
+        rotate_half_k = paddle.reshape(
+            paddle.concat(
+                [-k[:, :, :, sin.shape[-1] :], k[:, :, :, 0 : sin.shape[-1]]],
+                axis=-1,
+            ),
+            paddle.shape(k),
+        )
+        key = paddle.add(
+            paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos)
+        )
+        if v is not None:
+            rotate_half_v = paddle.reshape(
+                paddle.concat(
+                    [
+                        -v[:, :, :, sin.shape[-1] :],
+                        v[:, :, :, 0 : sin.shape[-1]],
+                    ],
+                    axis=-1,
+                ),
+                paddle.shape(v),
+            )
+            value = paddle.add(
+                paddle.multiply(v, cos_pos),
+                paddle.multiply(rotate_half_v, sin_pos),
+            )
+            return query, key, value
+        return query, key
+
+
+def remove_padding(seq_lens, cu_seq_lens, inputs, token_num):
+    """Pack the valid rows of padded [B, H, S, D] inputs into [T, H * D]."""
+    bsz, num_head, seq_len, dim_head = inputs.shape
+    hidden = num_head * dim_head
+    packed = paddle.zeros(shape=[token_num, hidden], dtype=inputs.dtype)
+    # [B, H, S, D] -> [B, S, H, D] -> [B, S, H * D]
+    rows = inputs.transpose([0, 2, 1, 3]).reshape([bsz, seq_len, -1])
+    for b in range(bsz):
+        begin = cu_seq_lens[b]
+        end = cu_seq_lens[b + 1]
+        packed[begin:end, :] = rows[b, : seq_lens[b], :]
+    return packed
+
+
+def block_cache_to_naive_cache(
+    cache_k, cache_v, bsz, block_tables, cache_seq_len
+):
+    """Gather block-paged K/V caches into dense per-sequence tensors.
+
+    Returns (k, v), each shaped [bsz, num_head, cache_seq_len, dim_head].
+    """
+    _, num_head, blocksize, dim_head = cache_k.shape
+    dense_shape = [bsz, num_head, cache_seq_len, dim_head]
+    out_cache_k = paddle.zeros(shape=dense_shape, dtype=cache_k.dtype)
+    out_cache_v = paddle.zeros(shape=dense_shape, dtype=cache_v.dtype)
+    for seq_id in range(bsz):
+        for pos in range(cache_seq_len):
+            # Position pos sits in slot pos % blocksize of its mapped block.
+            block_id = block_tables[seq_id, pos // blocksize]
+            slot = pos % blocksize
+            out_cache_k[seq_id, :, pos, :] = cache_k[block_id, :, slot, :]
+            out_cache_v[seq_id, :, pos, :] = cache_v[block_id, :, slot, :]
+    return out_cache_k, out_cache_v
+
+
+class TestBlockMultiHeadAttnRoPEXPU(unittest.TestCase):
+    def setUp(self):
+        """Create the encoder-phase inputs, paged KV caches and block tables."""
+        paddle.disable_static()
+        self.name = "TestBlockMultiHeadAttnRoPE"
+        self.place = paddle.XPUPlace(0)
+        self.batch_size = 2
+        self.num_head = 8
+        self.seq_len = 64
+        self.max_dec_len = 64
+        self.dim_head = 64
+        self.hid_dim = self.num_head * self.dim_head
+        self.blocksize = 64
+        self.block_num_per_seq = (
+            self.seq_len + self.max_dec_len + self.blocksize - 1
+        ) // self.blocksize
+        self.rope = RopeEmbedding()
+        self.max_block_num = self.block_num_per_seq * self.batch_size
+        self.free_list = list(range(self.max_block_num - 1, -1, -1))
+        self.seq_lens_encoder = paddle.to_tensor(
+            [
+                self.seq_len,
+            ]
+            * self.batch_size,
+            "int32",
+        )
+        self.seq_lens_decoder = paddle.to_tensor(
+            [
+                0,
+            ]
+            * self.batch_size,
+            "int32",
+        )
+        self.seq_lens_this_time = paddle.to_tensor(
+            [
+                self.seq_len,
+            ]
+            * self.batch_size,
+            "int32",
+        )
+        self.shape = (
+            self.batch_size,
+            self.num_head,
+            self.seq_len,
+            self.dim_head,
+        )
+        self.cache_shape = (
+            self.max_block_num,
+            self.num_head,
+            self.blocksize,
+            self.dim_head,
+        )
+        self.dtype = 'float16'
+        self.attention_mask = create_attn_mask(
+            self.dtype,
+            self.batch_size,
+            [
+                self.seq_len,
+            ]
+            * self.batch_size,
+        )
+        self.scale = 1.0 / np.sqrt(self.shape[-1])
+        self.cache_k = paddle.zeros(shape=self.cache_shape, dtype=self.dtype)
+        self.cache_v = paddle.zeros(shape=self.cache_shape, dtype=self.dtype)
+        self.block_tables = paddle.zeros(
+            shape=(self.batch_size, self.block_num_per_seq), dtype="int32"
+        )
+        self.cache_k_per_batch_maxs = paddle.zeros(
+            [self.batch_size, 6], dtype="float32"
+        )
+        self.cache_v_per_batch_maxs = paddle.zeros(
+            [self.batch_size, 6], dtype="float32"
+        )
+        # free_list is descending and pop() takes from the end, so sequence i
+        # receives block ids i * block_num_per_seq ... in ascending order.
+        for i in range(self.batch_size):
+            for j in range(self.block_num_per_seq):
+                self.block_tables[i, j] = self.free_list.pop()
+        (
+            self.padding_offset,
+            self.cum_offset,
+            self.cu_seqlens_q,
+            self.cu_seqlens_k,
+        ) = get_padding_offset(
+            self.batch_size, self.seq_len, self.seq_lens_this_time
+        )
+        self.token_num = self.padding_offset.shape[0]
+
+    def get_rotary_position_embedding(self, position_ids, head_dim):
+        """Build the float32 [2, B, S, 1, D] cos ([0]) / sin ([1]) table."""
+        bsz, max_seq_len = position_ids.shape[:2]
+        rot_emb = paddle.zeros(
+            (2, bsz, max_seq_len, 1, head_dim), dtype="float32"
+        )
+        inv_freq = 10000 ** (
+            -paddle.arange(0, head_dim, 2, dtype="float32") / head_dim
+        )
+
+        # shape: [B, S, D/2]
+        freqs = paddle.einsum(
+            "ij,k->ijk", position_ids.cast("float32"), inv_freq
+        )
+        # shape: [B, S, D]
+        emb = paddle.concat([freqs, freqs], axis=-1).reshape(
+            (bsz, max_seq_len, head_dim)
+        )
+        # The D/2 frequencies are laid out as two repeated halves (concat),
+        # not interleaved; test_all uses it with use_neox_rotary_style=True.
+        # shape: [B, S, 1, D]
+        emb = paddle.unsqueeze(emb, 2)
+
+        rot_emb[0] = paddle.cos(emb)
+        rot_emb[1] = paddle.sin(emb)
+        return rot_emb
+
+    def test_all(self):
+        paddle.disable_static()
+        tmp_position_ids = paddle.arange(
+            self.seq_len + self.max_dec_len
+        ).reshape((1, -1))
+        self.rope_emb = self.get_rotary_position_embedding(
+            tmp_position_ids, self.dim_head
+        )
+        # encoder
+        query = np.random.uniform(-1, 1, self.shape)
+        q = paddle.to_tensor(
+            query, place=self.place, dtype=self.dtype, stop_gradient=False
+        )
+        key = np.random.uniform(-1, 1, self.shape)
+        k = paddle.to_tensor(
+            key, place=self.place, dtype=self.dtype, stop_gradient=False
+        )
+        value = np.random.uniform(-1, 1, self.shape)
+        v = paddle.to_tensor(
+            value, place=self.place, dtype=self.dtype, stop_gradient=False
+        )
+        qkv = paddle.stack(
+            [
+                q.transpose([0, 2, 1, 3]).reshape(
+                    [self.token_num, self.hid_dim]
+                ),
+                k.transpose([0, 2, 1, 3]).reshape(
+                    [self.token_num, self.hid_dim]
+                ),
+                v.transpose([0, 2, 1, 3]).reshape(
+                    [self.token_num, self.hid_dim]
+                ),
+            ],
+            axis=1,
+        ).reshape([self.token_num, -1])
+        sinusoidal_pos = self.rope._rotary_position_embedding(
+            self.seq_len, self.dim_head, "float32"
+        )
+        q, k = self.rope._apply_neox_rope(
+            sinusoidal_pos.astype("float16"), q, k
+        )
+
+        out_ = naive_attention_impl(
+            q, k, v, None, None, None, None, self.attention_mask, self.scale
+        )
+        out_ = remove_padding(
+            self.seq_lens_this_time, self.cu_seqlens_q, out_, self.token_num
+        )
+        out = block_multihead_attention_xpu(
+            qkv,
+            self.cache_k,
+            self.cache_v,
+            self.seq_lens_encoder,
+            self.seq_lens_decoder,
+            self.seq_lens_this_time,
+            self.padding_offset,
+            self.cum_offset,
+            self.cu_seqlens_q,
+            self.cu_seqlens_k,
+            self.block_tables,
+            self.cache_k_per_batch_maxs,
+            self.cache_v_per_batch_maxs,
+            None,  # pre_key_cache
+            None,  # pre_value_cache
+            None,  # cache_k_quant_scales
+            None,  # cache_v_quant_scales
+            None,  # cache_k_dequant_scales
+            None,  # cache_v_dequant_scales
+            None,  # qkv_out_scale
+            None,  # qkv_bias
+            None,  # out_shift
+            None,  # out_smooth
+            None,  # max_enc_len_this_time
+            None,  # max_dec_len_this_time
+            self.rope_emb,  # rotary_embs
+            None,  # attn_mask
+            None,  # tgt_mask
+            self.seq_len,
+            self.blocksize,
+            True,  # use_neox_rotary_style
+        )[0]
+        np.testing.assert_allclose(
+            out.numpy(),
+            out_.numpy(),
+            rtol=5e-02,
+            atol=1e-03,
+        )
+        # decoder
+        naive_cache_k, naive_cache_v = block_cache_to_naive_cache(
+            self.cache_k,
+            self.cache_v,
+            self.batch_size,
+            self.block_tables,
+            self.seq_len,
+        )
+
+        self.seq_lens_decoder = self.seq_lens_encoder.clone()
+        self.seq_lens_encoder[:] = paddle.zeros_like(self.seq_lens_encoder)
+        self.seq_lens_this_time[:] = 1
+        self.shape = (
+            self.batch_size,
+            self.num_head,
+            1,
+            self.dim_head,
+        )
+        query = np.random.uniform(-1, 1, self.shape)
+        q = paddle.to_tensor(
+            query, place=self.place, dtype=self.dtype, stop_gradient=False
+        )
+        key = np.random.uniform(-1, 1, self.shape)
+        k = paddle.to_tensor(
+            key, place=self.place, dtype=self.dtype, stop_gradient=False
+        )
+        value = np.random.uniform(-1, 1, self.shape)
+        v = paddle.to_tensor(
+            value, place=self.place, dtype=self.dtype, stop_gradient=False
+        )
+
+        qkv = paddle.stack(
+            [
+                q.transpose([0, 2, 1, 3]).reshape(
+                    [self.batch_size, self.hid_dim]
+                ),
+                k.transpose([0, 2, 1, 3]).reshape(
+                    [self.batch_size, self.hid_dim]
+                ),
+                v.transpose([0, 2, 1, 3]).reshape(
+                    [self.batch_size, self.hid_dim]
+                ),
+            ],
+            axis=1,
+        ).reshape([self.batch_size, -1])
+
+        sinusoidal_pos = self.rope._rotary_position_embedding(
+            self.seq_len + 1, self.dim_head, "float32"
+        )[:, :, -1:, :]
+        q, k = self.rope._apply_neox_rope(
+            sinusoidal_pos.astype("float16"), q, k
+        )
+        (
+            self.padding_offset,
+            self.cum_offset,
+            self.cu_seqlens_q,
+            self.cu_seqlens_k,
+        ) = get_padding_offset(self.batch_size, 1, self.seq_lens_this_time)
+        out_ = (
+            naive_attention_impl(
+                q,
+                k,
+                v,
+                naive_cache_k,
+                naive_cache_v,
+                None,
+                None,
+                None,
+                self.scale,
+            )
+            .transpose([0, 2, 1, 3])
+            .reshape([self.batch_size, -1])
+        )
+        out = block_multihead_attention_xpu(
+            qkv,
+            self.cache_k,
+            self.cache_v,
+            self.seq_lens_encoder,
+            self.seq_lens_decoder,
+            self.seq_lens_this_time,
+            self.padding_offset,
+            self.cum_offset,
+            self.cu_seqlens_q,
+            self.cu_seqlens_k,
+            self.block_tables,
+            self.cache_k_per_batch_maxs,
+            self.cache_v_per_batch_maxs,
+            None,  # pre_key_cache
+            None,  # pre_value_cache
+            None,  # cache_k_quant_scales
+            None,  # cache_v_quant_scales
+            None,  # cache_k_dequant_scales
+            None,  # cache_v_dequant_scales
+            None,  # qkv_out_scale
+            None,  # qkv_bias
+            None,  # out_shift
+            None,  # out_smooth
+            None,  # max_enc_len_this_time
+            None,  # max_dec_len_this_time
+            self.rope_emb,  # rotary_embs
+            None,  # attn_mask
+            None,  # tgt_mask
+            self.seq_len + self.max_dec_len,  # seq_len,
+            self.blocksize,
+            True,  # use_neox_rotary_style
+        )[0]
+        # NOTE: decoder output deviates more from the reference; looser atol.
+        np.testing.assert_allclose(
+            out.numpy(),
+            out_.numpy(),
+            rtol=5e-02,
+            atol=5e-02,
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/xpu/test_collective_api_base.py b/test/xpu/test_collective_api_base.py
index 0c3d710a06335..c94061d5fc6d1 100644
--- a/test/xpu/test_collective_api_base.py
+++ b/test/xpu/test_collective_api_base.py
@@ -202,7 +202,7 @@ def setUp(self):
         self._trainers = 2
         self._ps_endpoints = f"127.0.0.1:{self._find_free_port()},127.0.0.1:{self._find_free_port()}"
         self._python_interp = sys.executable
-        self._master_endpoints = "127.0.0.1:%s" % (self._find_free_port())
+        self._master_endpoints = f"127.0.0.1:{self._find_free_port()}"
 
         self.temp_dir = tempfile.TemporaryDirectory()
 
@@ -300,15 +300,15 @@ def _run_cluster(self, model_file, envs):
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
-        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n')
+        sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n')
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
         with open(path0, "r") as f:
-            sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
+            sys.stderr.write(f'trainer 0 stderr file: {f.read()}\n')
         with open(path1, "r") as f:
-            sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
+            sys.stderr.write(f'trainer 1 stderr file: {f.read()}\n')
 
         def load_and_remove(path):
             with open(path, 'rb') as f:
diff --git a/test/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py
index 8a3289f0eb02a..c6cd081b498d7 100644
--- a/test/xpu/test_collective_base_xpu.py
+++ b/test/xpu/test_collective_base_xpu.py
@@ -244,8 +244,8 @@ def _run_cluster(self, model_file, envs):
 
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
-        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n')
+        sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n')
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
diff --git a/test/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py
index df36f226408eb..4c7419ae9e5fd 100644
--- a/test/xpu/test_conv2d_op_xpu.py
+++ b/test/xpu/test_conv2d_op_xpu.py
@@ -36,14 +36,14 @@ def conv2d_forward_naive(
 ):
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
-            "Unknown Attr(data_format): '%s' ."
-            "It can only be 'NCHW' or 'NHWC'." % str(data_format)
+            f"Unknown Attr(data_format): '{str(data_format)}' ."
+            "It can only be 'NCHW' or 'NHWC'."
         )
 
     channel_last = data_format == "NHWC"
diff --git a/test/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py
index 57c564335fbc1..1728889827992 100644
--- a/test/xpu/test_conv2d_transpose_op_xpu.py
+++ b/test/xpu/test_conv2d_transpose_op_xpu.py
@@ -31,8 +31,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     padding_algorithm = attrs['padding_algorithm']
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if attrs['data_format'] == 'NHWC':
diff --git a/test/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py
index 021c57821c12d..26582b4e1b2c5 100644
--- a/test/xpu/test_conv3d_op_xpu.py
+++ b/test/xpu/test_conv3d_op_xpu.py
@@ -31,14 +31,14 @@ def conv3d_forward_naive(
 ):
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
-            "Unknown Attr(data_format): '%s' ."
-            "It can only be 'NCDHW' or 'NDHWC'." % str(data_format)
+            f"Unknown Attr(data_format): '{str(data_format)}' ."
+            "It can only be 'NCDHW' or 'NDHWC'."
         )
 
     channel_last = data_format == "NDHWC"
diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py
index 96077ae8c83d0..878519fbd507d 100644
--- a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py
+++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py
@@ -31,8 +31,8 @@ def depthwiseconv2dtranspose_forward_naive(input_, filter_, attrs):
     padding_algorithm = attrs['padding_algorithm']
     if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
         raise ValueError(
-            "Unknown Attr(padding_algorithm): '%s'. "
-            "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+            f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+            "It can only be 'SAME' or 'VALID'."
         )
 
     if attrs['data_format'] == 'NHWC':
diff --git a/test/xpu/test_parallel_dygraph_dataparallel.py b/test/xpu/test_parallel_dygraph_dataparallel.py
index 0070f8ade9802..3eed21553b7a5 100644
--- a/test/xpu/test_parallel_dygraph_dataparallel.py
+++ b/test/xpu/test_parallel_dygraph_dataparallel.py
@@ -73,9 +73,11 @@ def start_local_trainers(
     for t in pod.trainers:
         proc_env = {
             "PADDLE_DISTRI_BACKEND": "bkcl",
-            "FLAGS_selected_xpus": "%s" % ",".join([str(g) for g in t.gpus]),
+            "FLAGS_selected_xpus": "{}".format(
+                ",".join([str(g) for g in t.gpus])
+            ),
             "PADDLE_TRAINER_ID": "%d" % t.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
+            "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}",
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
         }
diff --git a/test/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py
index f62ffb4fc45a6..1d3c1def63bfb 100644
--- a/test/xpu/test_pool2d_op_xpu.py
+++ b/test/xpu/test_pool2d_op_xpu.py
@@ -172,8 +172,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         padding_algorithm = padding_algorithm.upper()
         if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
             raise ValueError(
-                "Unknown Attr(padding_algorithm): '%s'. "
-                "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+                f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+                "It can only be 'SAME' or 'VALID'."
             )
 
         if padding_algorithm == "VALID":
diff --git a/test/xpu/test_pool3d_op_xpu.py b/test/xpu/test_pool3d_op_xpu.py
index 865029ad0d07d..01dd6d77b2b86 100644
--- a/test/xpu/test_pool3d_op_xpu.py
+++ b/test/xpu/test_pool3d_op_xpu.py
@@ -68,8 +68,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         padding_algorithm = padding_algorithm.upper()
         if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
             raise ValueError(
-                "Unknown Attr(padding_algorithm): '%s'. "
-                "It can only be 'SAME' or 'VALID'." % str(padding_algorithm)
+                f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. "
+                "It can only be 'SAME' or 'VALID'."
             )
 
         if padding_algorithm == "VALID":
diff --git a/test/xpu/test_swiglu_op_xpu.py b/test/xpu/test_swiglu_op_xpu.py
new file mode 100644
index 0000000000000..35d8350c85e26
--- /dev/null
+++ b/test/xpu/test_swiglu_op_xpu.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl
+
+
+def swiglu(x, y, out_grad):
+    """Reference SwiGLU: return [out, x_grad, y_grad] for silu(x) * y.
+
+    NumPy inputs (detected via x) are converted to tensors first. The
+    gradients are taken on detached clones of x and y, and every returned
+    tensor is cast to the dtype of x.
+    """
+    if isinstance(x, np.ndarray):
+        x, y, out_grad = (paddle.to_tensor(v) for v in (x, y, out_grad))
+
+    # Detached copies with gradients enabled; backward() fills their .grad.
+    x_copy = x.detach().clone()
+    x_copy.stop_gradient = False
+
+    y_copy = y.detach().clone()
+    y_copy.stop_gradient = False
+
+    result_dtype = x_copy.dtype
+    assert result_dtype == y_copy.dtype
+
+    # Unfused computation: silu applied to x, multiplied elementwise by y.
+    out = F.silu(x_copy) * y_copy
+    out.backward(out_grad)
+
+    return [
+        out.astype(result_dtype),
+        x_copy.grad.astype(result_dtype),
+        y_copy.grad.astype(result_dtype),
+    ]
+
+
+def fused_swiglu(x, y, out_grad):
+    """Run the fused swiglu op and return [out, x_grad, y_grad].
+
+    When y is None, x.grad is split in two along the last axis to give
+    the x and y gradients.
+    """
+    x = x.detach().clone()
+    x.stop_gradient = False
+    if y is not None:
+        y = y.detach().clone()
+        y.stop_gradient = False
+    out = fused_swiglu_impl(x, y)
+    out.backward(out_grad)
+
+    if y is None:
+        x_grad, y_grad = paddle.split(x.grad, 2, axis=-1)
+    else:
+        x_grad, y_grad = x.grad, y.grad
+
+    dtype = x.dtype
+    return [t.astype(dtype) for t in (out, x_grad, y_grad)]
+
+
+tol_map = {  # dtype -> [atol, rtol] used by assert_allclose
+    paddle.float64: [1e-8, 1e-8],
+    paddle.float32: [1e-6, 1e-6],
+    paddle.float16: [1e-3, 1e-3],
+    paddle.bfloat16: [1e-2, 1e-2],
+}
+
+
+class TestSwiGLUDygraph(unittest.TestCase):
+    """Check the fused swiglu op against the silu(x) * y reference on XPU."""
+
+    def setUp(self):
+        self.init_case()
+        self.seed = 1234
+
+    def init_case(self):
+        # Shapes exercised by test_main; TestSwigluOp overrides this.
+        self.shape = [[8, 100], [4, 102]]
+
+    def check_dygraph_impl(self, device, shape, dtype):
+        x = paddle.randn(shape, dtype=dtype)
+        y = paddle.randn(shape, dtype=dtype)
+        out_grad = paddle.randn(shape, dtype=dtype)
+
+        ret1 = swiglu(x, y, out_grad)
+        ret2 = fused_swiglu(x, y, out_grad)
+        ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad)
+
+        atol, rtol = tol_map[dtype]
+        err_msg = (
+            f"Failed when device = {device}, dtype = {dtype}, shape = {shape}"
+        )
+        for t1, t2, t3 in zip(ret1, ret2, ret3):
+            t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy()
+            np.testing.assert_allclose(
+                t1, t2, atol=atol, rtol=rtol, err_msg=err_msg
+            )
+            np.testing.assert_equal(t2, t3, err_msg=err_msg)
+
+    def check_dygraph(self, shape):
+        metas = [('xpu', paddle.float32), ('xpu', paddle.float64)]
+        # Enable in KL3: ('xpu', paddle.float16), ('xpu', paddle.bfloat16)
+
+        for device, dtype in metas:
+            origin_device = paddle.get_device()
+            paddle.set_device(device)
+            try:
+                self.check_dygraph_impl(device, shape, dtype)
+            finally:
+                paddle.set_device(origin_device)
+
+    def check_static_graph(self, shape, dtype="float32"):
+        x = paddle.static.data(name='x', shape=shape, dtype=dtype)
+        y = paddle.static.data(name='y', shape=shape, dtype=dtype)
+        concated_shape = list(shape[:-1]) + [shape[-1] * 2]
+        concated_x = paddle.static.data(
+            name='concated_x', shape=concated_shape, dtype=dtype
+        )
+        out1 = fused_swiglu_impl(x, y)
+        out2 = fused_swiglu_impl(concated_x)
+
+        concated_x_np = np.random.random(concated_x.shape).astype(dtype)
+        x_np, y_np = np.split(concated_x_np, 2, axis=-1)
+
+        exe = paddle.static.Executor()
+        t1, t2 = exe.run(
+            feed={'x': x_np, 'y': y_np, 'concated_x': concated_x_np},
+            fetch_list=[out1, out2],
+        )
+        # Compare the fetched arrays, not the graph variables out1/out2.
+        np.testing.assert_equal(t1, t2)
+
+    def check_main(self, shape):
+        self.check_dygraph(shape)
+        paddle.enable_static()
+        try:
+            main, startup = paddle.static.Program(), paddle.static.Program()
+            with paddle.static.program_guard(main, startup):
+                self.check_static_graph(shape)
+        finally:
+            # Leave static mode even when the static check fails.
+            paddle.disable_static()
+
+    def test_main(self):
+        for shape in self.shape:
+            self.check_main(shape)
+
+
+class TestSwigluOp(TestSwiGLUDygraph):  # reruns the checks on 3-D shapes
+    def init_case(self):
+        self.shape = [[1, 4096, 1376], [1, 4096, 11008]]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/xpu/test_zero_dim_tensor_xpu.py b/test/xpu/test_zero_dim_tensor_xpu.py
index 133c9b1302013..ac5e2df75b46f 100644
--- a/test/xpu/test_zero_dim_tensor_xpu.py
+++ b/test/xpu/test_zero_dim_tensor_xpu.py
@@ -345,7 +345,7 @@ def test_dygraph_binary(self):
             # 1) x is 0D, y is 0D
             x_np = np.random.randint(-10, 10, [])
             y_np = np.random.randint(-10, 10, [])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
@@ -357,7 +357,7 @@ def test_dygraph_binary(self):
             # 2) x is ND, y is 0D
             x_np = np.random.randint(-10, 10, [3, 5])
             y_np = np.random.randint(-10, 10, [])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
@@ -369,7 +369,7 @@ def test_dygraph_binary(self):
             # 3) x is 0D , y is ND
             x_np = np.random.randint(-10, 10, [])
             y_np = np.random.randint(-10, 10, [3, 5])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
diff --git a/third_party/onednn b/third_party/onednn
index 01204edbda1c2..0fb7e6ed4f32e 160000
--- a/third_party/onednn
+++ b/third_party/onednn
@@ -1 +1 @@
-Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56
+Subproject commit 0fb7e6ed4f32e5d89832b2bd742bbf834cd296ed
diff --git a/tools/CheckPRTemplate.py b/tools/CheckPRTemplate.py
index 1cc601dba0a29..a3a350d107af6 100644
--- a/tools/CheckPRTemplate.py
+++ b/tools/CheckPRTemplate.py
@@ -79,7 +79,7 @@ def parameter_accuracy(body):
             for i in value:
                 i = i.strip().lower()
                 if i not in test_list_lower:
-                    single_mess += '%s.' % i
+                    single_mess += f'{i}.'
             if len(single_mess) != 0:
                 message += f'{key} should be in {test_list}. but now is [{single_mess}].'
     return message
diff --git a/tools/CrossStackProfiler/CspFileReader.py b/tools/CrossStackProfiler/CspFileReader.py
index 28038b5c76d3b..5802edc965cca 100755
--- a/tools/CrossStackProfiler/CspFileReader.py
+++ b/tools/CrossStackProfiler/CspFileReader.py
@@ -108,7 +108,7 @@ def printArgs(self):
 
     def _checkArgsKey(self, key, type):
         if key not in self._args:
-            raise KeyError("args should has key [%s]!" % key)
+            raise KeyError(f"args should has key [{key}]!")
 
         if not isinstance(self._args[key], type):
             raise TypeError(
@@ -130,17 +130,14 @@ def _checkArgs(self):
             or self._organizeForm == FILEORGANIZEFORM_BYOTHER
         ):
             raise NotImplementedError(
-                "we have not known how to process this form of file [%s]!"
-                % self._organizeForm
+                f"we have not known how to process this form of file [{self._organizeForm}]!"
             )
 
         self._checkArgsKey("gpuPerTrainer", int)
 
         self._checkArgsKey("dataPath", str)
         if not os.path.exists(self._dataPath):
-            raise OSError(
-                "input data path [%s] not existed!" % (self._dataPath)
-            )
+            raise OSError(f"input data path [{self._dataPath}] not existed!")
 
         self._checkArgsKey("groupSize", int)
         self._checkArgsKey("displaySize", int)
@@ -183,8 +180,7 @@ def _getFileList(self):
                 newFileList.append(file)
             else:
                 raise NotImplementedError(
-                    "[%s] is repeated by id, we don not how to process it!"
-                    % file
+                    f"[{file}] is repeated by id, we don not how to process it!"
                 )
 
         if not self._fileList:
@@ -201,7 +197,7 @@ def _sortBySuffix(elem):
 
         if not self._fileList:
             self._logger.warning(
-                "we can not find any file in dir [%s]!" % self._dataPath
+                f"we can not find any file in dir [{self._dataPath}]!"
             )
         else:
             self._logger.info(
@@ -215,12 +211,11 @@ def _sortBySuffix(elem):
     def _getId(self, fileName, organizeForm, sed="."):
         if self._organizeForm != organizeForm:
             raise TypeError(
-                "Can not get rank id when organizer form is not %s!"
-                % organizeForm
+                f"Can not get rank id when organizer form is not {organizeForm}!"
             )
 
         if not os.path.isfile(fileName):
-            raise OSError("[%s] is not a valid file!" % (fileName))
+            raise OSError(f"[{fileName}] is not a valid file!")
 
         try:
             prefix_str = fileName.split(sed)[-1]
@@ -228,13 +223,12 @@ def _getId(self, fileName, organizeForm, sed="."):
                 return int(prefix_str)
             except ValueError as e:
                 print(e)
-                raise TypeError("invalid fileName [%s]" % fileName)
+                raise TypeError(f"invalid fileName [{fileName}]")
 
         except IndexError as e:
             print(e)
             raise TypeError(
-                "invalid fileName [%s], the prefix should be a number!"
-                % fileName
+                f"invalid fileName [{fileName}], the prefix should be a number!"
             )
 
     def getRankId(self, fileName, sed="."):
@@ -298,19 +292,15 @@ def getDcgmInfoDict(self, groupId, gpuId, tmpPath="./tmp"):
     def getDict(self, name, groupId, gpuId, tmpPath="./tmp"):
         fileName = self.getFileName(name, groupId, gpuId, tmpPath)
         if not os.path.isfile(fileName):
-            raise OSError("[%s] is not existed!" % fileName)
+            raise OSError(f"[{fileName}] is not existed!")
 
         data = {}
         with open(fileName, "r") as rf:
             try:
                 data = json.load(rf)
             except Exception:
-                self._logger.error(
-                    "read [%s] error. not a json file!" % (fileName)
-                )
-                raise TypeError(
-                    "read [%s] error. not a json file!" % (fileName)
-                )
+                self._logger.error(f"read [{fileName}] error. not a json file!")
+                raise TypeError(f"read [{fileName}] error. not a json file!")
         return data
 
     def dumpOpInfoDict(
@@ -344,7 +334,7 @@ def dumpDict(
         fileObject = open(fileName, 'w')
         fileObject.write(jsObj)
         fileObject.close()
-        self._logger.info("dump [%s] successfully!" % fileName)
+        self._logger.info(f"dump [{fileName}] successfully!")
 
 
 def getLogger():
diff --git a/tools/CrossStackProfiler/DCGMFileReader.py b/tools/CrossStackProfiler/DCGMFileReader.py
index f462ce5c9ad5e..eb31ad7820a78 100755
--- a/tools/CrossStackProfiler/DCGMFileReader.py
+++ b/tools/CrossStackProfiler/DCGMFileReader.py
@@ -88,7 +88,7 @@ def parseFileByGroup(self, groupId, processNum=8):
     def _parseTask(self, taskList, q=None):
         is_first = True
         for fileName in taskList:
-            self._logger.info("I am processing %s!" % fileName)
+            self._logger.info(f"I am processing {fileName}!")
             tmp_data = self._parseSingleFile(fileName)
             if tmp_data is None:
                 continue
@@ -103,7 +103,7 @@ def _parseTask(self, taskList, q=None):
         dcgm_data = dcgm_data.dropna()
         if q is not None:
             q.put(dcgm_data)
-        self._logger.info("I finish processing %s!" % fileName)
+        self._logger.info(f"I finish processing {fileName}!")
         return dcgm_data
 
     def _parseSingleFile(self, fileName):
@@ -192,7 +192,7 @@ def _getDCGMTraceInfoByGpuId(
 
                 di = {}
                 # name = "%s_%d" % (metric, trainerId)
-                name = "%s" % (metric)
+                name = f"{metric}"
                 di['name'] = name
                 di['pid'] = pid_map[metric]
                 di['ts'] = self._align_ts(int(row['ts']))
diff --git a/tools/CrossStackProfiler/ProfileFileReader.py b/tools/CrossStackProfiler/ProfileFileReader.py
index af955bd6652c4..266e9e5cf706d 100755
--- a/tools/CrossStackProfiler/ProfileFileReader.py
+++ b/tools/CrossStackProfiler/ProfileFileReader.py
@@ -46,7 +46,7 @@ def _parseTask(self, taskList, q=None):
             profile_dict["trainerRank.%03d" % (rankId)] = self._parseSingleFile(
                 fileName
             )
-            self._logger.info("I finish processing %s!" % fileName)
+            self._logger.info(f"I finish processing {fileName}!")
 
         if q is not None:
             q.put(profile_dict)
diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py
index 2f2d8b472c566..9d9ec062180cb 100644
--- a/tools/analysisPyXml.py
+++ b/tools/analysisPyXml.py
@@ -31,7 +31,7 @@ def analysisPyXml(rootPath, ut):
     for clazz in root.findall('packages/package/classes/class'):
         clazz_filename = clazz.attrib.get('filename')
         if not clazz_filename.startswith('/paddle'):
-            clazz_filename = '/paddle/%s' % clazz_filename
+            clazz_filename = f'/paddle/{clazz_filename}'
         for line in clazz.findall('lines/line'):
             line_hits = int(line.attrib.get('hits'))
             if line_hits != 0:
diff --git a/tools/analysis_build_time.py b/tools/analysis_build_time.py
index 6ae3ee6bbacc1..ae340a1bcfe03 100644
--- a/tools/analysis_build_time.py
+++ b/tools/analysis_build_time.py
@@ -33,10 +33,10 @@ def getUsefulBuildTimeFile(filename):
 
 
 def analysisBuildTime():
-    filename = '%s/build/build-time' % root_path
+    filename = f'{root_path}/build/build-time'
     getUsefulBuildTimeFile(filename)
-    os.system('rm -rf %s/tools/tempbuildTime.txt' % root_path)
-    with open('%s/tools/analysis_build_time' % root_path, 'r') as f:
+    os.system(f'rm -rf {root_path}/tools/tempbuildTime.txt')
+    with open(f'{root_path}/tools/analysis_build_time', 'r') as f:
         lines = f.readlines()
         for line in lines:
             try:
diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh
index ab7a3c60c5874..2fbb47ec37112 100644
--- a/tools/auto_parallel/ci_auto_parallel.sh
+++ b/tools/auto_parallel/ci_auto_parallel.sh
@@ -34,7 +34,7 @@ cd ${paddle_dir}
 # get the location of "test/auto_parallel" in target_lists_for_semi_auto_ci
 count=0
 for element in "${target_lists_for_semi_auto_ci[@]}";do
-  if [[ "$element" == "test/auto_parallel" ]]; then  
+  if [[ "$element" == "test/auto_parallel" ]]; then
     test_auto_num=$count
     break
   fi
@@ -43,7 +43,7 @@ done
 # get the location of "test/collective/hybrid_strategy" in target_lists_for_dygraph_ci
 count=0
 for element in "${target_lists_for_dygraph_ci[@]}";do
-  if [[ "$element" == "test/collective/hybrid_strategy" ]]; then  
+  if [[ "$element" == "test/collective/hybrid_strategy" ]]; then
     test_dygraph_num=$count
     break
   fi
@@ -64,7 +64,7 @@ for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{pri
     elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
         continue
     else
-        # The most auto unittests have been monitored in PR-CI-Distribute-stable, 
+        # The most auto unittests have been monitored in PR-CI-Distribute-stable,
         # while the other tests of llama model will be executed in PR-CI-Auto-Parallel.
         for ((i=0; i<${#target_lists_for_semi_auto_ci[@]}; i++)); do
             if [[ $i != ${test_auto_num} ]] && [[ ${file_item} == *${target_lists_for_semi_auto_ci[i]}* ]];then
@@ -122,7 +122,7 @@ get_diff_TO_case
 
 ####################
 if [[ "${case_list[*]}" == *"gpt-3_auto"* ]] && [[ "${case_list[*]}" == *"gpt-3_auto_pir"* ]]; then
-    echo "同时命中gpt-3_auto 和 gpt-3_auto_pir, 只执行新ir, 不执行旧ir"  
+    echo "同时命中gpt-3_auto 和 gpt-3_auto_pir, 只执行新ir, 不执行旧ir"
     case_list=("${case_list[@]/*gpt-3_auto_pir*/}")
     case_list=("${case_list[@]/*gpt-3_auto*/}")
     case_list[${#case_list[*]}]=gpt-3_auto_pir
@@ -135,7 +135,7 @@ if [[ ${#case_list[*]} -ne 0 ]];then
     echo -e "\033[31m ---- case_list length: ${#case_list[*]}, cases: ${case_list[*]} \033"
     echo -e "\033[31m ============================= \033"
     set +e
-    
+
     # Install paddle
     install_paddle
     case_num=1
diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh
index b036c08e1d93e..6d422774d12ed 100644
--- a/tools/check_added_ut.sh
+++ b/tools/check_added_ut.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,7 +32,7 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then
     cp $PADDLE_ROOT/paddle/scripts/paddle_build.sh $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh
 elif [[ "$SYSTEM" == "Windows_NT" ]];then
     git remote | grep upstream
-    if [ $? != 0 ]; then 
+    if [ $? != 0 ]; then
         git remote add upstream https://github.com/PaddlePaddle/Paddle.git
     fi
     git fetch upstream ${BRANCH}
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 4a8e7cf708994..7819072687da7 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -40,12 +40,18 @@ function add_failed(){
 
 api_params_diff=`python ${PADDLE_ROOT}/tools/check_api_compatible.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec`
 api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api`
+api_annotation_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.annotations  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.annotations`
 if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n"
 
     check_approval 1 XiaoguangHu01 jeff41404 lanxianghit qingqing01
 fi
 
+if [ "$api_annotation_diff" != "" ]; then
+    echo_line="You must have one member of Typing group (SigureMo, megemini, zrr1999, sunzhongkai588, luotao1) approval for API annotation change.\n"
+    check_approval 1 SigureMo megemini zrr1999 sunzhongkai588 luotao1
+fi
+
 api_yaml_diff=`python ${PADDLE_ROOT}/tools/check_api_yaml_same.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec ${BRANCH} ${PADDLE_ROOT}`
 if [ "$api_yaml_diff" != "" ]; then
     echo_line="API's name and params should be consistent with op's name and params in yaml.
@@ -133,7 +139,7 @@ if [ -n "${echo_list}" ];then
   echo "**************************************************************"
 
   # L40 L48 L62 has fetch the result out, but there are splitted.
-  if [ "${api_spec_diff}" != "" -o "${api_doc_spec_diff}" != "" ] ; then
+  if [ "${api_spec_diff}" != "" -o "${api_annotation_diff}" != "" ] ; then
     python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec
   fi
   if [ "${api_params_diff}" != "" ] ; then
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index d637c4f0c3b82..c844c09565da3 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -21,11 +21,12 @@ fi
 
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
 # If you want to add monitoring file modifications, please perform the. github/CODEOWNERS operation
-API_FILES=("tools/print_signatures.py"
-           "tools/sampcd_processor.py"
-           "tools/check_pr_approval.py"
-	   "tools/checkout_api_compatible.py"
-           )
+API_FILES=(
+    "tools/print_signatures.py"
+    "tools/sampcd_processor.py"
+    "tools/check_pr_approval.py"
+    "tools/checkout_api_compatible.py"
+)
 
 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
 git_files=`git diff --numstat upstream/$BRANCH| wc -l`
diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index ca3df4bb99eef..82f7967133576 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -21,7 +21,7 @@
 
 def check_path_exists(path):
     """Assert whether file/directory exists."""
-    assert os.path.exists(path), "%s does not exist." % path
+    assert os.path.exists(path), f"{path} does not exist."
 
 
 def parse_case_name(log_file_name):
@@ -48,7 +48,7 @@ def parse_log_file(log_file):
                 pass  # do nothing
 
     if result is None:
-        logging.warning("Parse %s fail!" % log_file)
+        logging.warning(f"Parse {log_file} fail!")
 
     return result
 
@@ -81,29 +81,29 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result):
     develop_total_time = develop_data.get("total")
     total_time_diff = (pr_total_time - develop_total_time) / develop_total_time
 
-    logging.info("------ OP: %s ------" % case_name)
+    logging.info(f"------ OP: {case_name} ------")
     logging.info(
         f"GPU time change: {gpu_time_diff_str} (develop: {develop_gpu_time:.7f} -> PR: {pr_gpu_time:.7f})"
     )
     logging.info(
         f"Total time change: {total_time_diff * 100:.5f}% (develop: {develop_total_time:.7f} -> PR: {pr_total_time:.7f})"
     )
-    logging.info("backward: %s" % pr_result.get("backward"))
+    logging.info("backward: {}".format(pr_result.get("backward")))
     logging.info("parameters:")
     for line in pr_result.get("parameters").strip().split("\n"):
-        logging.info("\t%s" % line)
+        logging.info(f"\t{line}")
 
     return gpu_time_diff > 0.05
 
 
 def check_accuracy_result(case_name, pr_result):
     """Check accuracy result."""
-    logging.info("------ OP: %s ------" % case_name)
-    logging.info("Accuracy diff: %s" % pr_result.get("diff"))
-    logging.info("backward: %s" % pr_result.get("backward"))
+    logging.info(f"------ OP: {case_name} ------")
+    logging.info("Accuracy diff: {}".format(pr_result.get("diff")))
+    logging.info("backward: {}".format(pr_result.get("backward")))
     logging.info("parameters:")
     for line in pr_result.get("parameters").strip().split("\n"):
-        logging.info("\t%s" % line)
+        logging.info(f"\t{line}")
 
     return not pr_result.get("consistent")
 
@@ -154,11 +154,11 @@ def update_api_info_file(fail_case_list, api_info_file):
 def summary_results(check_results, api_info_file):
     """Summary results and return sys.exit code."""
     for case_name in check_results["speed"]:
-        logging.error("Check speed result with case \"%s\" failed." % case_name)
+        logging.error(f"Check speed result with case \"{case_name}\" failed.")
 
     for case_name in check_results["accuracy"]:
         logging.error(
-            "Check accuracy result with case \"%s\" failed." % case_name
+            f"Check accuracy result with case \"{case_name}\" failed."
         )
 
     if len(check_results["speed"]) and api_info_file:
diff --git a/tools/check_sequence_op.sh b/tools/check_sequence_op.sh
index 35357476a3224..51a482c3e9306 100644
--- a/tools/check_sequence_op.sh
+++ b/tools/check_sequence_op.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh
index 58e327327e6ad..93eb52a4f16aa 100644
--- a/tools/ci_op_benchmark.sh
+++ b/tools/ci_op_benchmark.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -333,6 +333,6 @@ fi
 case $1 in
   run_op_benchmark)
     prepare_env
-    gpu_op_benchmark 
+    gpu_op_benchmark
   ;;
 esac
diff --git a/tools/cinn/ci_build.sh b/tools/cinn/ci_build.sh
index 19aef611d7158..18e133fb1bfe6 100755
--- a/tools/cinn/ci_build.sh
+++ b/tools/cinn/ci_build.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 CINN Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/cinn/docker/Dockerfile b/tools/cinn/docker/Dockerfile
index fcbe406ea46af..180e8ff78dd38 100644
--- a/tools/cinn/docker/Dockerfile
+++ b/tools/cinn/docker/Dockerfile
@@ -16,12 +16,12 @@ ENV HOME /root
 RUN apt-get update && \
     apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \
     apt-get update && \
-    apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \ 
+    apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \
         coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev
 
 
 # Downgrade gcc&&g++
-WORKDIR /usr/bin 
+WORKDIR /usr/bin
 RUN apt-get update --fix-missing
 COPY script_build /script_build
 RUN bash /script_build/install_gcc.sh gcc82 && rm -rf /script_build && \
@@ -30,7 +30,7 @@ RUN bash /script_build/install_gcc.sh gcc82 && rm -rf /script_build && \
     ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ && \
     ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc && \
     ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++
-ENV PATH=/usr/local/gcc-8.2/bin:$PATH 
+ENV PATH=/usr/local/gcc-8.2/bin:$PATH
 
 RUN apt-get update && \
     apt-get install -y python3.6 python3.6-dev python3.6-venv && \
@@ -43,8 +43,8 @@ RUN wget -q https://cmake.org/files/v3.20/cmake-3.20.0-linux-x86_64.tar.gz && ta
 ENV PATH=/home/cmake-3.20.0-linux-x86_64/bin:$PATH
 
 # remove them when apt-get support 2.27 and higher version
-RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ 
-    tar -xzf binutils-2.33.1.tar.gz && \ 
+RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \
+    tar -xzf binutils-2.33.1.tar.gz && \
     cd binutils-2.33.1 && \
     ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz
 
@@ -99,7 +99,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     make -j8 && make install && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
 
-# For CINN environment 
+# For CINN environment
 RUN apt update --fix-missing && \
     apt install autoconf autogen libtool zlib1g-dev sudo libginac-dev clang cmake -y && \
     apt remove python3-six python-six -y && \
diff --git a/tools/cinn/docker/Dockerfile.ci b/tools/cinn/docker/Dockerfile.ci
index 53e1bbf64ec51..c91ecbb3641d5 100644
--- a/tools/cinn/docker/Dockerfile.ci
+++ b/tools/cinn/docker/Dockerfile.ci
@@ -1,5 +1,5 @@
 # Use SHA to specify the docker image to prevent the use of old cache images
-FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82 
+FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82
 
 # NVIDIA update GPG key on 04/29/2022. Fetch the public key for CI machine
 # Reference: https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
diff --git a/tools/cinn/docker/script_build/install_gcc.sh b/tools/cinn/docker/script_build/install_gcc.sh
index e744e9ddac66e..46470b179ad88 100644
--- a/tools/cinn/docker/script_build/install_gcc.sh
+++ b/tools/cinn/docker/script_build/install_gcc.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,7 +30,7 @@ else
 fi
 
 if [ "$1" == "gcc82" ]; then
-  wget -q https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz 
+  wget -q https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz
   tar -xvf gcc-8.2.0.tar.xz && \
   cd gcc-8.2.0 && \
   unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \
@@ -39,12 +39,12 @@ if [ "$1" == "gcc82" ]; then
   ../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \
   make -j8 && make install
   cd .. && rm -rf temp_gcc82
-  cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} && 
+  cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} &&
   ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \
   ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \
   cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path}
 elif [ "$1" == "gcc54" ]; then
-  wget -q http://ftp.tsukuba.wide.ad.jp/software/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 
+  wget -q http://ftp.tsukuba.wide.ad.jp/software/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2
   tar -xvf gcc-5.4.0.tar.bz2 && \
   cd gcc-5.4.0 && \
   unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \
@@ -53,7 +53,7 @@ elif [ "$1" == "gcc54" ]; then
   ../gcc-5.4.0/configure --prefix=/usr/local/gcc-5.4 --enable-checking=release --enable-languages=c,c++ --disable-multilib && \
   make -j8 && make install
   cd .. && rm -rf temp_gcc54
-  cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} && 
+  cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} &&
   ln -s /usr/local/gcc-5.4/lib64/libgfortran.so.5 ${lib_so_5} && \
   ln -s /usr/local/gcc-5.4/lib64/libstdc++.so.6 ${lib_so_6} && \
   cp /usr/local/gcc-5.4/lib64/libstdc++.so.6.0.21 ${lib_path}
diff --git a/tools/cinn/gen_c++_tutorial.py b/tools/cinn/gen_c++_tutorial.py
index 97e6d16fef088..be391b44ef730 100644
--- a/tools/cinn/gen_c++_tutorial.py
+++ b/tools/cinn/gen_c++_tutorial.py
@@ -59,13 +59,13 @@ def code_block(self, lang: str, block: List[str]):
                 break
             else:
                 tail_valid_offset += 1
-        logging.warning("block0: %s" % block)
+        logging.warning(f"block0: {block}")
         block = (
             block[pre_valid_offset:-tail_valid_offset]
             if tail_valid_offset > 0
             else block[pre_valid_offset:]
         )
-        logging.warning("block1: %s" % block)
+        logging.warning(f"block1: {block}")
         if not block:
             return
 
@@ -189,7 +189,7 @@ def eat_roc(self, header: str, content: ContentGenerator) -> None:
             code_block.append(line)
             line: str = content.get_line()
 
-        logging.warning("DOC content: %s" % code_block)
+        logging.warning(f"DOC content: {code_block}")
 
         self.doc.code_block(lang, code_block)
 
diff --git a/tools/codestyle/clang-tidy.py b/tools/codestyle/clang-tidy.py
index 404413b9b9945..7fe5029cd1823 100644
--- a/tools/codestyle/clang-tidy.py
+++ b/tools/codestyle/clang-tidy.py
@@ -166,9 +166,9 @@ def get_tidy_invocation(
         os.close(handle)
         start.append(name)
     for arg in extra_arg:
-        start.append('-extra-arg=%s' % arg)
+        start.append(f'-extra-arg={arg}')
     for arg in extra_arg_before:
-        start.append('-extra-arg-before=%s' % arg)
+        start.append(f'-extra-arg-before={arg}')
     start.append('-p=' + build_path)
     if quiet:
         start.append('-quiet')
diff --git a/tools/continuous_integration/bisect.py b/tools/continuous_integration/bisect.py
index c4b31bb6e8729..2feaf7be5ec6e 100644
--- a/tools/continuous_integration/bisect.py
+++ b/tools/continuous_integration/bisect.py
@@ -84,11 +84,11 @@ def print_arguments():
     [f'git rev-list --first-parent {args.good_commit}...{args.bad_commit}'],
     shell=True,
 )
-sys.stdout.write('commits found:\n%s\n' % ret)
+sys.stdout.write(f'commits found:\n{ret}\n')
 commits = ret.strip().split('\n')
 os.chdir(args.build_dir)
 # Clean up previous logs.
-subprocess.check_output(['echo "" > %s' % args.log_file], shell=True)
+subprocess.check_output([f'echo "" > {args.log_file}'], shell=True)
 
 last_culprit = ''
 while True:
@@ -96,8 +96,7 @@ def print_arguments():
     os.chdir(args.git_dir)
     subprocess.check_output(
         [
-            'git checkout %s && git clean -fd && git checkout .'
-            % args.bisect_branch
+            f'git checkout {args.bisect_branch} && git clean -fd && git checkout .'
         ],
         shell=True,
     )
@@ -109,7 +108,7 @@ def print_arguments():
     pick_idx = len(commits) / 2
     pick = commits[pick_idx]
     os.chdir(args.git_dir)
-    subprocess.check_output(['git checkout %s' % pick], shell=True)
+    subprocess.check_output([f'git checkout {pick}'], shell=True)
 
     # Clean builds and compile.
     # We assume mainline commits should always compile.
@@ -120,7 +119,7 @@ def print_arguments():
         'rm -rf * && '
         f'cmake -DWITH_TESTING=ON {args.git_dir} >> {args.log_file} && make -j{args.build_parallel} >> {args.log_file}'
     )
-    sys.stdout.write('cmd: %s\n' % cmd)
+    sys.stdout.write(f'cmd: {cmd}\n')
     try:
         subprocess.check_output([cmd], shell=True)
     except subprocess.CalledProcessError as e:
@@ -130,7 +129,7 @@ def print_arguments():
     passed = True
     try:
         cmd = f'ctest --repeat-until-fail {args.test_times} -R {args.test_target} >> {args.log_file}'
-        sys.stdout.write('cmd: %s\n' % cmd)
+        sys.stdout.write(f'cmd: {cmd}\n')
         subprocess.check_output([cmd], shell=True)
     except subprocess.CalledProcessError as e:
         passed = False
@@ -145,4 +144,4 @@ def print_arguments():
             break
         commits = commits[pick_idx + 1 :]
 
-sys.stdout.write('Culprit commit: %s\n' % last_culprit)
+sys.stdout.write(f'Culprit commit: {last_culprit}\n')
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index c9ae968c920c5..be2a97b036191 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -16,8 +16,8 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN yum install -y bzip2 gettext-devel sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel patch
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh
-#RUN bash build_scripts/install_nccl2.sh 
-RUN bash build_scripts/install_trt.sh 
+#RUN bash build_scripts/install_nccl2.sh
+RUN bash build_scripts/install_trt.sh
 RUN rm -rf build_scripts
 RUN ln -s /usr/local/ssl/include/openssl /usr/include
 
@@ -26,7 +26,7 @@ RUN wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
   tar -xvf git-2.17.1.tar.gz && \
   cd git-2.17.1 && \
   ./configure --with-openssl --prefix=/usr/local && \
-  make -j8 && make install 
+  make -j8 && make install
 
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
@@ -43,7 +43,7 @@ RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.15.12.l
 
 
 # protobuf 3.6.1
-RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \ 
+RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \
     tar xzf protobuf-cpp-3.6.1.tar.gz && \
     cd protobuf-3.6.1 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.6.1.tar.gz
 
diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu
index 7b9e15bbf5ff1..ef55b8920559f 100644
--- a/tools/dockerfile/Dockerfile.ipu
+++ b/tools/dockerfile/Dockerfile.ipu
@@ -27,14 +27,14 @@ RUN apt-get update && apt-get install -y rdma-core librdmacm1
 
 # Downgrade gcc&&g++
 WORKDIR /usr/bin
-COPY tools/dockerfile/build_scripts /build_scripts 
-RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts 
-RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ 
-RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc 
-RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ 
-RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc 
-RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ 
-ENV PATH=/usr/local/gcc-8.2/bin:$PATH 
+COPY tools/dockerfile/build_scripts /build_scripts
+RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
+RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
+RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
+RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++
+RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc
+RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++
+ENV PATH=/usr/local/gcc-8.2/bin:$PATH
 
 # install cmake
 WORKDIR /home
diff --git a/tools/dockerfile/Dockerfile.release.ubuntu20 b/tools/dockerfile/Dockerfile.release.ubuntu20
index 7a14eb6534afa..397ca3cdfce96 100644
--- a/tools/dockerfile/Dockerfile.release.ubuntu20
+++ b/tools/dockerfile/Dockerfile.release.ubuntu20
@@ -27,26 +27,26 @@ RUN apt-get update --allow-unauthenticated && \
   apt-get install -y software-properties-common && \
   add-apt-repository ppa:deadsnakes/ppa && \
   apt-get update && \
-  apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \ 
+  apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \
     coreutils ntp language-pack-zh-hans libsm6 libxext6 libxrender-dev libgl1-mesa-glx \
     bison graphviz libjpeg-dev zlib1g-dev automake locales swig net-tools libtool kmod
 <install_cpu_package>
 
 # Downgrade gcc&&g++
-WORKDIR /usr/bin 
-COPY tools/dockerfile/build_scripts /build_scripts 
+WORKDIR /usr/bin
+COPY tools/dockerfile/build_scripts /build_scripts
 RUN bash /build_scripts/install_trt.sh
 # Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
 # # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
 # # So install a newer version here.
 RUN bash /build_scripts/install_patchelf.sh
 RUN bash /build_scripts/install_gcc.sh gcc121
-RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ 
-RUN ln -s /usr/local/gcc-12.1/bin/gcc /usr/local/bin/gcc 
-RUN ln -s /usr/local/gcc-12.1/bin/g++ /usr/local/bin/g++ 
-RUN ln -s /usr/local/gcc-12.1/bin/gcc /usr/bin/gcc 
-RUN ln -s /usr/local/gcc-12.1/bin/g++ /usr/bin/g++ 
-ENV PATH=/usr/local/gcc-12.1/bin:$PATH 
+RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
+RUN ln -s /usr/local/gcc-12.1/bin/gcc /usr/local/bin/gcc
+RUN ln -s /usr/local/gcc-12.1/bin/g++ /usr/local/bin/g++
+RUN ln -s /usr/local/gcc-12.1/bin/gcc /usr/bin/gcc
+RUN ln -s /usr/local/gcc-12.1/bin/g++ /usr/bin/g++
+ENV PATH=/usr/local/gcc-12.1/bin:$PATH
 
 RUN bash /build_scripts/install_cudnn.sh cudnn841
 ENV CUDNN_VERSION=8.4.1
@@ -79,8 +79,8 @@ RUN rm setuptools-68.2.2.tar.gz pip-23.3.1.tar.gz && \
     rm -r setuptools-68.2.2 pip-23.3.1
 
 # remove them when apt-get support 2.27 and higher version
-RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ 
-    tar -xzf binutils-2.33.1.tar.gz && \ 
+RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \
+    tar -xzf binutils-2.33.1.tar.gz && \
     cd binutils-2.33.1 && \
     ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz
 
diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh
index cb17a76a1dd05..402111b38e163 100644
--- a/tools/dockerfile/build_scripts/build.sh
+++ b/tools/dockerfile/build_scripts/build.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh
index 10088cd2c5b02..2d5d35754551c 100755
--- a/tools/dockerfile/build_scripts/build_utils.sh
+++ b/tools/dockerfile/build_scripts/build_utils.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -144,7 +144,7 @@ function do_openssl_build {
     ./config -fPIC --prefix=/usr/local/ssl > /dev/null
     make > /dev/null
     make install > /dev/null
-    
+
 }
 
 
diff --git a/tools/dockerfile/build_scripts/install_cudnn.sh b/tools/dockerfile/build_scripts/install_cudnn.sh
index 78f03766c6fcf..402122dc205de 100644
--- a/tools/dockerfile/build_scripts/install_cudnn.sh
+++ b/tools/dockerfile/build_scripts/install_cudnn.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/dockerfile/build_scripts/install_gcc.sh b/tools/dockerfile/build_scripts/install_gcc.sh
index 4451e2783bb6b..5adbdd0faa2ac 100644
--- a/tools/dockerfile/build_scripts/install_gcc.sh
+++ b/tools/dockerfile/build_scripts/install_gcc.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,7 +30,7 @@ else
 fi
 
 if [ "$1" == "gcc82" ]; then
-  wget -q --no-proxy https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz 
+  wget -q --no-proxy https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz
   tar -xf gcc-8.2.0.tar.xz && \
   cd gcc-8.2.0 && \
   wget -q --no-proxy https://paddle-ci.gz.bcebos.com/sanitizer_platform_limits_posix.cc.patch
@@ -44,7 +44,7 @@ if [ "$1" == "gcc82" ]; then
   make -j8 && make install
   cd .. && rm -rf temp_gcc82 gcc-8.2.0 gcc-8.2.0.tar.xz
   if [ -f "/etc/redhat-release" ];then
-    cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} && 
+    cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} &&
     ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \
     ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \
     cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path}
diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh
index 2680910834023..b279e0e6f094d 100644
--- a/tools/dockerfile/build_scripts/install_nccl2.sh
+++ b/tools/dockerfile/build_scripts/install_nccl2.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/dockerfile/build_scripts/install_patchelf.sh b/tools/dockerfile/build_scripts/install_patchelf.sh
index ef6b05ec02468..bdcebff0f3690 100644
--- a/tools/dockerfile/build_scripts/install_patchelf.sh
+++ b/tools/dockerfile/build_scripts/install_patchelf.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,4 +34,4 @@ make
 make install
 
 cd ..
-rm -rf "$TMP_DIR" 
+rm -rf "$TMP_DIR"
diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh
index 0fdadc8be8d70..6a35d1bfdce38 100644
--- a/tools/dockerfile/build_scripts/install_trt.sh
+++ b/tools/dockerfile/build_scripts/install_trt.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -30,7 +30,7 @@ if [[ "$1" == "trt8034" && "$VERSION" == "11.2" ]];then
   wget -q --no-proxy https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz --no-check-certificate
   tar -zxf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz -C /usr/local
   cp -rf /usr/local/TensorRT-8.0.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.0.3.4/lib/* /usr/lib/
-  rm TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz  
+  rm TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz
 elif [[ "$1" == "trt8424" ]];then
    wget https://paddle-qa.bj.bcebos.com/nvidia/trt/TensorRT-8.4.2.4.tgz --no-check-certificate
    tar -zxf TensorRT-8.4.2.4.tgz -C /usr/local
diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh
index 09793d8843226..38c443bde9cb8 100755
--- a/tools/dockerfile/centos7_manylinux.sh
+++ b/tools/dockerfile/centos7_manylinux.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -61,11 +61,11 @@ function make_cuda123cudnn900trt8616() {
 }
 
 function main() {
-  local CMD=$1 
+  local CMD=$1
   case $CMD in
     cuda112cudnn821trt8034)
       make_cuda112cudnn821trt8034
-     ;; 
+     ;;
     cuda116cudnn840trt8406)
       make_cuda116cudnn840trt8406
      ;;
diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index f1a1db3773b91..cdae7c0c2fe66 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -55,7 +55,7 @@ function make_ce_framework_dockcerfile(){
   sed -i 's#<install_cpu_package>##g' ${dockerfile_name}
   sed -i "7i RUN chmod 777 /tmp" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \
-     tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} 
+     tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN apt-get update && apt install -y zstd pigz libcurl4-openssl-dev gettext ninja-build" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN pip3.10 install wheel distro" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN pip3.10 install nvidia-cuda-cupti-cu11==11.8.87 nvidia-cuda-runtime-cu11==11.8.89 nvidia-cudnn-cu11==8.7.0.84 nvidia-cublas-cu11==11.11.3.6 nvidia-cufft-cu11==10.9.0.58 nvidia-curand-cu11==10.3.0.86 nvidia-cusolver-cu11==11.4.1.48 nvidia-cusparse-cu11==11.7.5.86 nvidia-nccl-cu11==2.19.3" ${dockerfile_name}
diff --git a/tools/dockerfile/ubuntu20_dev.sh b/tools/dockerfile/ubuntu20_dev.sh
index 27fe1694287df..ec9d9d9f97e3f 100755
--- a/tools/dockerfile/ubuntu20_dev.sh
+++ b/tools/dockerfile/ubuntu20_dev.sh
@@ -5,9 +5,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/dockerfile/ubuntu20_release.sh b/tools/dockerfile/ubuntu20_release.sh
index 8fa08f5326025..e870649e3a695 100755
--- a/tools/dockerfile/ubuntu20_release.sh
+++ b/tools/dockerfile/ubuntu20_release.sh
@@ -5,9 +5,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/document_preview.sh b/tools/document_preview.sh
index 47c5207074046..97c01ee96d03b 100755
--- a/tools/document_preview.sh
+++ b/tools/document_preview.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -45,7 +45,7 @@ function get_docs_pr_num_from_paddle_pr_info(){
 }
 
 # Attention:
-# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. 
+# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs.
 # 2. And /docs is used as the output of doc-build process.
 # 3. If conflicted with yours, please modify the definition of FLUIDDOCDIR and
 #    OUTPUTDIR in the subsequent codes.
diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh
index 77ffe9c158c7d..ba419f77f2bc1 100644
--- a/tools/enforce/count_enforce_by_dir.sh
+++ b/tools/enforce/count_enforce_by_dir.sh
@@ -15,10 +15,10 @@
 # limitations under the License.
 
 # This script is used to count detail PADDLE checks in the paddle/fluid directory,
-#   contains the number of PADDLE checks under each folder, the statistical data 
+#   contains the number of PADDLE checks under each folder, the statistical data
 #   does not include subdirectories, only covers all files under the current directory.
-#   
-#   The three columns of data are: total number, valid number, invalid number. 
+#
+#   The three columns of data are: total number, valid number, invalid number.
 #   The output format is easy to display as a markdown table.
 
 # Usage: bash count_enforce_by_dir.sh (run in tools directory)
@@ -70,8 +70,8 @@ function count_dir_independently(){
             enforce_count $1"/"$file dir_total_check_cnt dir_valid_check_cnt
             sub_dir_total_check_cnt=$(($sub_dir_total_check_cnt+$dir_total_check_cnt))
             sub_dir_valid_check_cnt=$(($sub_dir_valid_check_cnt+$dir_valid_check_cnt))
-            
-            count_dir_independently $1"/"$file $dir_total_check_cnt $dir_valid_check_cnt 
+
+            count_dir_independently $1"/"$file $dir_total_check_cnt $dir_valid_check_cnt
         fi
     done
     total_check_cnt=$(($2-$sub_dir_total_check_cnt))
diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh
index c79d486c62838..b06514a4e03bb 100644
--- a/tools/enforce/count_enforce_by_file.sh
+++ b/tools/enforce/count_enforce_by_file.sh
@@ -16,8 +16,8 @@
 
 # This script is used to count PADDLE checks by files in the paddle/fluid/operators directory,
 #   contains the number of PADDLE checks under each file.
-#   
-#   The three columns of data are: total number, valid number, invalid number. 
+#
+#   The three columns of data are: total number, valid number, invalid number.
 #   The output format is easy to display as a markdown table.
 
 # Usage: bash count_enforce_by_file.sh  [target directory or file] (run in tools directory)
diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh
index d60a26d157cce..057a67ef46a41 100644
--- a/tools/externalError/start.sh
+++ b/tools/externalError/start.sh
@@ -1,13 +1,13 @@
 #!/usr/bin/env bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py
index e0fc86c19a8cc..2dbfdd39c1a2c 100644
--- a/tools/final_ut_parallel_rule.py
+++ b/tools/final_ut_parallel_rule.py
@@ -19,7 +19,7 @@
 
 def classify_cases_by_mem(rootPath):
     """classify cases by mem"""
-    case_filename = '%s/build/classify_case_by_cardNum.txt' % rootPath
+    case_filename = f'{rootPath}/build/classify_case_by_cardNum.txt'
     case_exec_100 = [
         'test_conv_eltwiseadd_bn_fuse_pass',
         'test_trt_convert_pool2d',
@@ -124,14 +124,14 @@ def classify_cases_by_mem(rootPath):
             else:
                 case_mem_1[case] = new_lastest_mem[case]["mem_nvidia"]
 
-        with open('/pre_test/%s_mem0' % cardType, 'w') as f:
+        with open(f'/pre_test/{cardType}_mem0', 'w') as f:
             f.write(case_mem_0)
             f.close()
 
         case_mem_1_sort = sorted(case_mem_1.items(), key=lambda x: x[1])
         case_mem_1_line = '^job$'
         mem_1_sum = 0
-        with open('/pre_test/%s' % cardType, 'w') as f_not_0:
+        with open(f'/pre_test/{cardType}', 'w') as f_not_0:
             for index in case_mem_1_sort:
                 if mem_1_sum < 14 * 1024 * 2:
                     mem_1_sum += index[1]
@@ -150,7 +150,7 @@ def classify_cases_by_mem(rootPath):
                     f_not_0.write(case_mem_1_line + '\n')
             f_not_0.close()
 
-    os.system('cp %s/build/nightly_case /pre_test/' % rootPath)
+    os.system(f'cp {rootPath}/build/nightly_case /pre_test/')
 
 
 if __name__ == '__main__':
diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh
index 3ab1e68b37557..c57f3f6bba2b1 100755
--- a/tools/gen_alias_mapping.sh
+++ b/tools/gen_alias_mapping.sh
@@ -17,16 +17,16 @@
 # Brief:
 #     This code is used for generating the mapping list of Paddle API alias.
 #     Only the APIs set with the `DEFINE_ALIAS` flag is enable.
-# 
+#
 # Arguments:
 #     None
-# 
+#
 # Usage:
-#     Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh`     
+#     Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh`
 #
 # Returns:
 #     succ: 0
-# 
+#
 #     Will also print the mapping list to stdout. The format of each line is as below:
 #         <real API implement>\t<API recommend>,<API other alias name1>,<API other alias name2>,...
 
@@ -38,7 +38,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \
     | grep 'DEFINE_ALIAS' \
     | perl -ne '
         if (/\/python\/(.*):from (\.*)(\w.*) import (.*?)\s+#DEFINE_ALIAS\s+$/) {
-            my @arr = split(", ", $4); 
+            my @arr = split(", ", $4);
             foreach $i (@arr) {
                 printf "%s|%s|%s|%d\n", $3, $i, substr($1, 0, -3), length($2);
             }
@@ -66,7 +66,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \
             }
             key = key""new;
             n2o[key] = val;
-        } 
+        }
         END {
             for (new in n2o) {
                 old = n2o[new] in n2o ? n2o[n2o[new]] : n2o[new];
@@ -78,7 +78,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \
         {
             o2n[$1] = o2n[$1] ? o2n[$1]","$3 : $3;
         }
-        END { 
+        END {
             for (i in o2n) {
                 print i"\t"o2n[i];
             }
diff --git a/tools/gen_tensor_stub.py b/tools/gen_tensor_stub.py
index 00c7fb0c2e50c..422b3004f5266 100644
--- a/tools/gen_tensor_stub.py
+++ b/tools/gen_tensor_stub.py
@@ -15,17 +15,18 @@
 from __future__ import annotations
 
 import argparse
+import importlib
 import inspect
 import logging
 import re
+import sys
+import types
 from dataclasses import dataclass
 from functools import cached_property, lru_cache
 from typing import Any, Callable, Literal
 
 from typing_extensions import TypeAlias
 
-import paddle
-
 logging.basicConfig(style="{", format="{message}", level=logging.INFO)
 logger = logging.getLogger("Generating stub file for paddle.Tensor")
 logger.setLevel(logging.INFO)
@@ -102,7 +103,6 @@ def find_apis(self, api_name: str) -> list[dict[str, tuple[str, int, int]]]:
         api = []
         for mo in pattern.finditer(self._template):
             _indent = mo.group('indent')
-            _def_api = mo.group('def_api')
             _signature = mo.group('signature')
             _docstring = mo.group('docstring')
             _ellipsis = mo.group('ellipsis')
@@ -110,26 +110,15 @@ def find_apis(self, api_name: str) -> list[dict[str, tuple[str, int, int]]]:
             _comment = '' if _comment is None else _comment
 
             _start_index, _end_index = mo.span()
-
-            _start_indent = _start_index
-            _end_indent = _start_indent + len(_indent)
-
-            _start_def_api = _end_indent
-            _end_def_api = _start_def_api + len(_def_api)
-
-            _start_signature = _end_def_api
-            _end_signature = _start_signature + len(_signature)
-
-            _start_docstring = _end_signature
-            _end_docstring = _start_docstring + len(_docstring)
-
-            _start_ellipsis = _end_docstring
-            _end_ellipsis = _start_ellipsis + len(_ellipsis)
-
+            _start_indent, _end_indent = mo.span('indent')
+            _start_signature, _end_signature = mo.span('signature')
+            _start_docstring, _end_docstring = mo.span('docstring')
+            _start_ellipsis, _end_ellipsis = mo.span('ellipsis')
             _start_comment = _end_ellipsis
             _end_comment = _start_comment + len(_comment)
 
-            assert _end_index == _end_comment
+            assert _start_index == _start_indent
+            assert _end_comment == _end_index
 
             _api = {
                 'indent': (_indent, _start_indent, _end_indent),
@@ -216,7 +205,10 @@ def add_doc(self, doc: str):
         self.insert_template(docstring, _end_index, _end_index)
 
     def codegen(self) -> str:
-        return self._template
+        header = (
+            '# This file is auto generated by `tools/gen_tensor_stub.py`.\n\n'
+        )
+        return header + self._template
 
 
 def is_inherited_member(name: str, cls: type) -> bool:
@@ -336,7 +328,27 @@ def func_doc_to_method_doc(func_doc: str) -> str:
     return method_doc
 
 
+def try_import_paddle() -> types.ModuleType | None:
+    try:
+        return importlib.import_module('paddle')
+    except ModuleNotFoundError:
+        sys.stderr.write(
+            '''ERROR: Can NOT import paddle.
+            We could import paddle without installation, with all libs (.dll or .so) copied into dir `paddle/libs`,
+            or path already been set for the system.
+            '''
+        )
+
+
 def get_tensor_members():
+    paddle = try_import_paddle()
+    if not paddle:
+        raise (
+            ModuleNotFoundError(
+                'Can NOT import paddle from tools/gen_tensor_stub.py.'
+            )
+        )
+
     tensor_class = paddle.Tensor
 
     members: dict[int, Member] = {}
@@ -433,7 +445,7 @@ def get_tensor_template(path: str) -> str:
         return ''.join(f.readlines())
 
 
-def main():
+def parse_args():
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
@@ -442,7 +454,6 @@ def main():
         type=str,
         default="python/paddle/tensor/tensor.prototype.pyi",
     )
-
     parser.add_argument(
         "-o",
         "--output-file",
@@ -452,12 +463,16 @@ def main():
 
     args = parser.parse_args()
 
+    return args
+
+
+def generate_stub_file(input_file=None, output_file=None):
     # Get members of Tensor
     tensor_members = get_tensor_members()
     logging.debug(f'total members in Tensor: {len(tensor_members)}')
 
     # Get tensor template
-    tensor_template = get_tensor_template(args.input_file)
+    tensor_template = get_tensor_template(input_file)
 
     # Generate the Tensor stub
     tensor_gen = TensorGen(tensor_template)
@@ -473,9 +488,14 @@ def main():
             tensor_gen.add_doc(member.doc)
 
     # Write to target file
-    with open(args.output_file, "w", encoding="utf-8") as f:
+    with open(output_file, "w", encoding="utf-8") as f:
         f.write(tensor_gen.codegen())
 
 
+def main():
+    args = parse_args()
+    generate_stub_file(args.input_file, args.output_file)
+
+
 if __name__ == "__main__":
     main()
diff --git a/tools/get_build_time.sh b/tools/get_build_time.sh
index 496c8c12d6ca3..85100bb50c761 100755
--- a/tools/get_build_time.sh
+++ b/tools/get_build_time.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh
index bce338a8619e6..b7ec2e77a3a84 100755
--- a/tools/get_cpu_info.sh
+++ b/tools/get_cpu_info.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -54,7 +54,7 @@ echo "OS Version             : `uname -o`"
 echo "Kernel Release Version : `uname -r`"
 echo "Kernel Patch Version   : `uname -v`"
 echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
-if command -v cmake >/dev/null 2>&1; then 
+if command -v cmake >/dev/null 2>&1; then
   cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
 else
   cmake_ver=" Not installed"
diff --git a/tools/get_op_list.sh b/tools/get_op_list.sh
index 2e4cad13582df..2b5d7f419b1d2 100644
--- a/tools/get_op_list.sh
+++ b/tools/get_op_list.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py
index bf469eab98747..42b1c251f19a1 100644
--- a/tools/get_ut_file_map.py
+++ b/tools/get_ut_file_map.py
@@ -19,8 +19,8 @@
 
 def get_all_paddle_file(rootPath):
     """get all file in Paddle repo: paddle/fluild, python"""
-    traverse_files = ['%s' % rootPath]
-    all_file_paddle = '%s/build/all_file_paddle' % rootPath
+    traverse_files = [f'{rootPath}']
+    all_file_paddle = f'{rootPath}/build/all_file_paddle'
     all_file_paddle_list = []
     with open(all_file_paddle, 'w') as f:
         for filename in traverse_files:
@@ -32,7 +32,7 @@ def get_all_paddle_file(rootPath):
 
 
 def get_all_uts(rootPath):
-    all_uts_paddle = '%s/build/all_uts_paddle' % rootPath
+    all_uts_paddle = f'{rootPath}/build/all_uts_paddle'
     os.system(
         fr'cd {rootPath}/build && ctest -N -V | grep -Ei "Test[ \t]+#" | grep -oEi "\w+$" > {all_uts_paddle}'
     )
@@ -42,28 +42,28 @@ def remove_useless_file(rootPath):
     """remove useless file in ut_file_map.json"""
     all_file_paddle_list = get_all_paddle_file(rootPath)
     ut_file_map_new = {}
-    ut_file_map = "%s/build/ut_file_map.json" % rootPath
+    ut_file_map = f"{rootPath}/build/ut_file_map.json"
     with open(ut_file_map, 'r') as load_f:
         load_dict = json.load(load_f)
     for key in load_dict:
         if key in all_file_paddle_list:
             ut_file_map_new[key] = load_dict[key]
 
-    with open("%s/build/ut_file_map.json" % rootPath, "w") as f:
+    with open(f"{rootPath}/build/ut_file_map.json", "w") as f:
         json.dump(ut_file_map_new, f, indent=4)
         print("remove_useless_file ut_file_map success!!")
 
 
 def handle_ut_file_map(rootPath):
     utNotSuccess_list = []
-    ut_map_path = "%s/build/ut_map" % rootPath
+    ut_map_path = f"{rootPath}/build/ut_map"
     files = os.listdir(ut_map_path)
     ut_file_map = {}
     count = 0
-    not_success_file = open("%s/build/prec_delta" % rootPath, 'w')
+    not_success_file = open(f"{rootPath}/build/prec_delta", 'w')
     # if testdir is not made,write the test into prec_delta
     get_all_uts(rootPath)
-    all_ut = '%s/build/all_uts_paddle' % rootPath
+    all_ut = f'{rootPath}/build/all_uts_paddle'
     with open(all_ut, 'r') as f:
         all_ut_list = []
         for ut in f.readlines():
@@ -73,7 +73,7 @@ def handle_ut_file_map(rootPath):
     for ut in all_ut_list:
         filedir = f'{rootPath}/build/ut_map/{ut}'
         if not os.path.exists(filedir):
-            not_success_file.write('%s\n' % ut)
+            not_success_file.write(f'{ut}\n')
             utNotSuccess_list.append(ut)
     # if fnda.tmp not exists,write the test into prec_delta
     for ut in files:
@@ -108,7 +108,7 @@ def handle_ut_file_map(rootPath):
                     ut_file_map[source_file].append(ut)
             f.close()
         else:
-            not_success_file.write('%s\n' % ut)
+            not_success_file.write(f'{ut}\n')
             utNotSuccess_list.append(ut)
     not_success_file.close()
 
@@ -135,13 +135,13 @@ def handle_ut_file_map(rootPath):
                 if source_file not in ut_file_map:
                     ut_file_map[source_file] = []
             f.close()
-    with open("%s/build/ut_file_map.json" % rootPath, "w") as f:
+    with open(f"{rootPath}/build/ut_file_map.json", "w") as f:
         json.dump(ut_file_map, f, indent=4)
 
 
 def notsuccessfuc(rootPath):
     utNotSuccess = ''
-    ut_map_path = "%s/build/ut_map" % rootPath
+    ut_map_path = f"{rootPath}/build/ut_map"
     files = os.listdir(ut_map_path)
     count = 0
 
@@ -154,7 +154,7 @@ def notsuccessfuc(rootPath):
             pass
         else:
             count = count + 1
-            utNotSuccess = utNotSuccess + '^%s$|' % ut
+            utNotSuccess = utNotSuccess + f'^{ut}$|'
 
     # ut not exec
 
@@ -166,7 +166,7 @@ def notsuccessfuc(rootPath):
         if ut not in files:
             print(ut)
             count = count + 1
-            utNotSuccess = utNotSuccess + '^%s$|' % ut
+            utNotSuccess = utNotSuccess + f'^{ut}$|'
 
     if utNotSuccess != '':
         print("utNotSuccess count: %s" % count)
@@ -176,18 +176,17 @@ def notsuccessfuc(rootPath):
 
 
 def ut_file_map_supplement(rootPath):
-    ut_file_map_new = "%s/build/ut_file_map.json" % rootPath
+    ut_file_map_new = f"{rootPath}/build/ut_file_map.json"
     precision_test_map_store_dir = "/precision_test_map_store"
-    os.system('mkdir %s' % precision_test_map_store_dir)
+    os.system(f'mkdir {precision_test_map_store_dir}')
     os.system(
-        'cd %s && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json --no-check-certificate'
-        % precision_test_map_store_dir
+        f'cd {precision_test_map_store_dir} && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json --no-check-certificate'
     )
-    ut_file_map_old = "%s/ut_file_map.json" % precision_test_map_store_dir
+    ut_file_map_old = f"{precision_test_map_store_dir}/ut_file_map.json"
     with open(ut_file_map_new, 'r') as load_f:
         load_dict_new = json.load(load_f)
 
-    all_uts_paddle = '%s/build/all_uts_paddle' % rootPath
+    all_uts_paddle = f'{rootPath}/build/all_uts_paddle'
 
     with open(all_uts_paddle, 'r') as f:
         all_uts_paddle_list = []
@@ -195,15 +194,14 @@ def ut_file_map_supplement(rootPath):
             all_uts_paddle_list.append(ut.strip())
         f.close()
 
-    with open("%s/ut_file_map.json" % precision_test_map_store_dir, "w") as f:
+    with open(f"{precision_test_map_store_dir}/ut_file_map.json", "w") as f:
         json.dump(load_dict_new, f, indent=4)
         print("load_dict_new success!!")
 
     os.system(
-        'cd %s && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta --no-check-certificate'
-        % precision_test_map_store_dir
+        f'cd {precision_test_map_store_dir} && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta --no-check-certificate'
     )
-    prec_delta_new = "%s/build/prec_delta" % rootPath
+    prec_delta_new = f"{rootPath}/build/prec_delta"
     with open(prec_delta_new, 'r') as f:
         prec_delta_new_list = []
         for ut in f.readlines():
@@ -212,7 +210,7 @@ def ut_file_map_supplement(rootPath):
     prec_delta_new_list.append(
         'test_py_reader_error_msg'
     )  # add a python case for pycoverage
-    prec_delta_file = open("%s/prec_delta" % precision_test_map_store_dir, 'w')
+    prec_delta_file = open(f"{precision_test_map_store_dir}/prec_delta", 'w')
     for ut in prec_delta_new_list:
         prec_delta_file.write(ut + '\n')
     print("prec_delta_file success!!")
@@ -220,7 +218,7 @@ def ut_file_map_supplement(rootPath):
 
 
 def utmap_analysis(rootPath):
-    ut_file_map_new = "%s/build/ut_file_map.json" % rootPath
+    ut_file_map_new = f"{rootPath}/build/ut_file_map.json"
     with open(ut_file_map_new, 'r') as load_f:
         load_dict_new = json.load(load_f)
     print(len(load_dict_new))
diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh
index 07122405a21d7..e0669fb85e658 100644
--- a/tools/gpups_test.sh
+++ b/tools/gpups_test.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -40,7 +40,7 @@ function get_quickly_disable_ut() {
     fi
 }
 
-# disable test: 
+# disable test:
 # test_dygraph_dataparallel_bf16
 # test_dygraph_sharding_stage2_bf16
 # test_dygraph_sharding_stage3_bf16
diff --git a/tools/group_case_for_parallel.py b/tools/group_case_for_parallel.py
index 66187ca4b0607..9af5e084bada2 100644
--- a/tools/group_case_for_parallel.py
+++ b/tools/group_case_for_parallel.py
@@ -40,29 +40,29 @@ def group_case_for_parallel(rootPath):
             )
 
     # get nightly tests
-    nightly_tests_file = open('%s/tools/nightly_case' % rootPath, 'r')
+    nightly_tests_file = open(f'{rootPath}/tools/nightly_case', 'r')
     nightly_tests = nightly_tests_file.read().strip().split('\n')
     nightly_tests_file.close()
 
     parallel_case_file_list = [
-        '%s/tools/single_card_tests_mem0' % rootPath,
-        '%s/tools/single_card_tests' % rootPath,
-        '%s/tools/multiple_card_tests_mem0' % rootPath,
-        '%s/tools/multiple_card_tests' % rootPath,
-        '%s/tools/exclusive_card_tests_mem0' % rootPath,
-        '%s/tools/exclusive_card_tests' % rootPath,
+        f'{rootPath}/tools/single_card_tests_mem0',
+        f'{rootPath}/tools/single_card_tests',
+        f'{rootPath}/tools/multiple_card_tests_mem0',
+        f'{rootPath}/tools/multiple_card_tests',
+        f'{rootPath}/tools/exclusive_card_tests_mem0',
+        f'{rootPath}/tools/exclusive_card_tests',
     ]
-    case_file = '%s/build/ut_list' % rootPath
+    case_file = f'{rootPath}/build/ut_list'
     if os.path.exists(case_file):
         f = open(case_file, 'r')
         all_need_run_cases = f.read().strip().split('\n')
         if len(all_need_run_cases) == 1 and all_need_run_cases[0] == '':
             f.close()
-            case_file = '%s/build/all_ut_list' % rootPath
+            case_file = f'{rootPath}/build/all_ut_list'
             f = open(case_file, 'r')
             all_need_run_cases = f.read().strip().split('\n')
     else:
-        case_file = '%s/build/all_ut_list' % rootPath
+        case_file = f'{rootPath}/build/all_ut_list'
         f = open(case_file, 'r')
         all_need_run_cases = f.read().strip().split('\n')
 
@@ -71,7 +71,7 @@ def group_case_for_parallel(rootPath):
     all_group_case = []
     for filename in parallel_case_file_list:
         fi = open(filename, 'r')
-        new_f = open('%s_new' % filename, 'w')
+        new_f = open(f'{filename}_new', 'w')
         lines = fi.readlines()
         new_case_file_list = []
         for line in lines:
@@ -88,7 +88,7 @@ def group_case_for_parallel(rootPath):
 
         for line in new_case_file_list:
             cases = '$|^'.join(case for case in line)
-            cases = '^job$|^%s$' % cases
+            cases = f'^job$|^{cases}$'
             new_f.write(cases + '\n')
         fi.close()
         new_f.close()
@@ -98,10 +98,10 @@ def group_case_for_parallel(rootPath):
     if len(all_need_run_cases) != 0:
         for case in all_need_run_cases:
             if case not in nightly_tests:
-                cases = cases + '$|^%s' % case
-        cases = '%s$' % cases
+                cases = cases + f'$|^{case}'
+        cases = f'{cases}$'
 
-    new_f = open('%s/tools/no_parallel_case_file' % rootPath, 'w')
+    new_f = open(f'{rootPath}/tools/no_parallel_case_file', 'w')
     new_f.write(cases + '\n')
     new_f.close()
     f.close()
diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py
index 86458045d3de8..656e47fdba896 100644
--- a/tools/handle_h_cu_file.py
+++ b/tools/handle_h_cu_file.py
@@ -43,7 +43,7 @@ def threadPool(threadPoolNum):
 
 
 def get_h_file_md5(rootPath):
-    h_cu_files = '%s/tools/h_cu_files.log' % rootPath
+    h_cu_files = f'{rootPath}/tools/h_cu_files.log'
     f = open(h_cu_files)
     lines = f.readlines()
     for line in lines:
@@ -52,7 +52,7 @@ def get_h_file_md5(rootPath):
 
 
 def insert_pile_to_h_file(rootPath):
-    h_cu_files = '%s/tools/h_cu_files.log' % rootPath
+    h_cu_files = f'{rootPath}/tools/h_cu_files.log'
     f = open(h_cu_files)
     lines = f.readlines()
     for line in lines:
@@ -60,7 +60,7 @@ def insert_pile_to_h_file(rootPath):
         func = line.replace('/', '_').replace('.', '_')
         os.system(f'echo "\n#ifndef _PRECISE{func.upper()}_\n" >> {line}')
         os.system(f'echo "#define _PRECISE{func.upper()}_" >> {line}')
-        os.system('echo "\n#include <cstdio>\n" >> %s' % line)
+        os.system(f'echo "\n#include <cstdio>\n" >> {line}')
         os.system(
             f'echo "__attribute__((constructor)) static void calledFirst{func}()\n{{" >> {line}'
         )
@@ -68,43 +68,40 @@ def insert_pile_to_h_file(rootPath):
             'echo \'    fprintf(stderr,"precise test map fileeee: %%s\\\\n", __FILE__);\n}\' >> %s'
             % line
         )
-        os.system('echo "\n#endif" >> %s' % line)
+        os.system(f'echo "\n#endif" >> {line}')
 
 
 def add_simple_cxx_test(rootPath):
-    variant_test_path = '%s/paddle/utils/variant_test.cc' % rootPath
-    variant_test_cmakeflie_path = '%s/paddle/utils/CMakeLists.txt' % rootPath
+    variant_test_path = f'{rootPath}/paddle/utils/variant_test.cc'
+    variant_test_cmakeflie_path = f'{rootPath}/paddle/utils/CMakeLists.txt'
     if os.path.exists(variant_test_path) and os.path.exists(
         variant_test_cmakeflie_path
     ):
-        simple_test_path = '%s/paddle/utils/simple_precision_test.cc' % rootPath
-        os.system('touch %s' % simple_test_path)
+        simple_test_path = f'{rootPath}/paddle/utils/simple_precision_test.cc'
+        os.system(f'touch {simple_test_path}')
+        os.system(f"echo '#include \"gtest/gtest.h\"\n' >> {simple_test_path}")
         os.system(
-            "echo '#include \"gtest/gtest.h\"\n' >> %s" % simple_test_path
-        )
-        os.system(
-            'echo "TEST(interface_test, type) { }\n" >> %s' % simple_test_path
+            f'echo "TEST(interface_test, type) {{ }}\n" >> {simple_test_path}'
         )
         os.system('echo "cc_test(" >> %s' % variant_test_cmakeflie_path)
         os.system(
-            'echo "  simple_precision_test" >> %s' % variant_test_cmakeflie_path
+            f'echo "  simple_precision_test" >> {variant_test_cmakeflie_path}'
         )
         os.system(
-            'echo "  SRCS simple_precision_test.cc" >> %s'
-            % variant_test_cmakeflie_path
+            f'echo "  SRCS simple_precision_test.cc" >> {variant_test_cmakeflie_path}'
         )
-        os.system('echo "  DEPS gtest)\n" >> %s' % variant_test_cmakeflie_path)
+        os.system(f'echo "  DEPS gtest)\n" >> {variant_test_cmakeflie_path}')
 
 
 def remove_pile_from_h_file(rootPath):
-    h_cu_files = '%s/tools/h_cu_files.log' % rootPath
+    h_cu_files = f'{rootPath}/tools/h_cu_files.log'
     f = open(h_cu_files)
     lines = f.readlines()
     count = 12
     for line in lines:
         line = line.strip()
         while count > 0:
-            os.system("sed -i '$d' %s" % line)
+            os.system(f"sed -i '$d' {line}")
             count = count - 1
         count = 12
 
diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh
index 31e1a44540133..bb851c11df6db 100755
--- a/tools/nvcc_lazy.sh
+++ b/tools/nvcc_lazy.sh
@@ -17,7 +17,7 @@
 echo "#!/usr/bin/env bash" >> $1
 echo "unset GREP_OPTIONS" >> $1
 echo "set -e" >> $1
-echo -e >> $1 
+echo -e >> $1
 echo "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved." >> $1
 echo "#" >> $1
 echo "# Licensed under the Apache License, Version 2.0 (the \"License\");" >> $1
@@ -25,7 +25,7 @@ echo "# you may not use this file except in compliance with the License." >> $1
 echo "# You may obtain a copy of the License at" >> $1
 echo "#" >> $1
 echo "#     http://www.apache.org/licenses/LICENSE-2.0" >> $1
-echo "#" >> $1 
+echo "#" >> $1
 echo "# Unless required by applicable law or agreed to in writing, software" >> $1
 echo "# distributed under the License is distributed on an \"AS IS\" BASIS," >> $1
 echo "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." >> $1
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 646a40b30dc0c..23fae5e53097d 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -66,7 +66,7 @@
     'test_paddle_inference_api',
     'test_reference_count_pass_last_lived_ops',
     'test_op_support_gpu',
-    'test_conditional_block',
+    'test_conditional_block_deprecated',
     'test_fleet_rolemaker_init',
     'test_pybind_interface',
     'test_io_save_load',
@@ -228,7 +228,7 @@
     'test_analyzer_capi_exp_pd_threads',
     'test_selected_rows',
     'test_fleet_sharding_meta_optimizer',
-    'test_inference_api',
+    'test_inference_api_deprecated',
     'test_data_generator',
     'test_deprecated_memory_optimize_interfaces',
     'test_ir_skip_layernorm_pass',
@@ -445,7 +445,7 @@
     'op_version_registry_test',
     'test_cudnn_placement_pass',
     'cipher_utils_test',
-    'test_program_code',
+    'test_program_code_deprecated',
     'test_save_model_without_var',
     'program_utils_test',
     'test_fleet_distributed_strategy',
@@ -503,8 +503,8 @@
     'test_dist_fleet_heter_program',
     'test_dist_fleet_ctr',
     'test_collective_allreduce_api',
-    'test_dataloader_unkeep_order',
-    'test_dataloader_keep_order',
+    'test_dataloader_unkeep_order_deprecated',
+    'test_dataloader_keep_order_deprecated',
     'test_dist_se_resnext_sync',
     'test_dist_fleet_ps6',
     'test_dist_fleet_a_sync_optimizer_auto_async',
@@ -553,7 +553,7 @@
     'test_sync_batch_norm_op',
     'test_dist_mnist_batch_merge',
     'test_fleet_launch_ps',
-    'test_dist_sparse_tensor_load_sgd',
+    'test_dist_sparse_tensor_load_sgd_deprecated',
     'test_dist_fleet_a_sync_optimizer_auto_geo',
     'test_dist_lookup_sparse_table_fuse_ops',
     'test_dist_fleet_a_sync_optimizer_geo_deprecated',
@@ -728,7 +728,7 @@
     'test_cyclic_cifar_dataset',
     'test_dyn_rnn',
     'test_multiclass_nms_op',
-    'test_communicator_geo',
+    'test_communicator_geo_deprecated',
     'test_quant_int8_mobilenetv2_mkldnn',
     'test_analyzer_seq_pool1',
     'test_analyzer_transformer',
@@ -851,7 +851,7 @@
     'test_distribution',
     'test_box_clip_op',
     'custom_tensor_test',
-    'test_dataloader_early_reset',
+    'test_dataloader_early_reset_deprecated',
     'test_gather_nd_op',
     'test_tensor_register_hook',
     'test_retain_graph',
@@ -918,7 +918,7 @@
     'test_nanmedian',
     'test_linear',
     'test_imperative_qat_amp',
-    'test_truncated_gaussian_random_op',
+    'test_truncated_gaussian_random_op_deprecated',
     'test_lstm_cudnn_op',
     'copy_same_tensor_test',
     'test_squeeze2_op',
@@ -942,6 +942,7 @@
     'test_sign_op',
     'selected_rows_functor_gpu_test',
     'test_fleet_base',
+    'test_fleet_base_deprecated',
     'test_logsumexp',
     'test_detection',
     'test_image_classification_fp16',
@@ -1080,7 +1081,7 @@
     'test_prelu_op',
     'test_l1_norm_op',
     'test_rmsprop_op',
-    'test_fuse_bn_act_pass',
+    'test_fuse_bn_act_pass_deprecated',
     'test_inplace_addto_strategy',
     'test_paddle_save_load',
     'test_prelu_mkldnn_op',
@@ -1232,7 +1233,7 @@
     'test_memory_reuse_exclude_feed_var',
     'test_polygon_box_transform',
     'math_function_gpu_test',
-    'test_program_prune_backward',
+    'test_program_prune_backward_deprecated',
     'test_ema_fleet',
     'test_fleet_amp_init',
     'test_normalize',
@@ -1265,7 +1266,7 @@
     'test_nn_margin_rank_loss',
     'test_arg_min_max_v2_op',
     'test_variance_layer',
-    'test_quantization_scale_pass',
+    'test_quantization_scale_pass_deprecated',
     'test_segment_ops',
     'test_layers',
     'test_imperative_qat_channelwise',
@@ -1274,7 +1275,7 @@
     'test_l1_loss',
     'test_ifelse',
     'test_cache_program',
-    'test_ir_fc_fuse_pass',
+    'test_ir_fc_fuse_pass_deprecated',
     'test_kldiv_loss_op',
     'test_switch_case',
     'test_unique',
@@ -1332,6 +1333,7 @@
     'test_callbacks',
     'test_imperative_recurrent_usage',
     'test_deform_conv2d',
+    'test_deform_conv2d_deprecated',
     'test_coalesce_tensor_op',
     'test_tsm',
     'test_fused_multihead_matmul_op',
@@ -1382,7 +1384,7 @@
     'test_deformable_psroi_pooling',
     'test_multi_precision_fp16_train',
     'test_adam_op_multi_thread',
-    'test_decoupled_py_reader',
+    'test_decoupled_py_reader_deprecated',
     'test_distribute_fpn_proposals_op',
     'transform_test',
     'test_nan_inf',
@@ -1467,11 +1469,11 @@
     'test_trt_matmul',
     'test_trt_fc_fuse_pass',
     'test_trt_pad_op',
-    'test_imperative_lod_tensor_to_selected_rows',
+    'test_imperative_lod_tensor_to_selected_rows_deprecated',
     'test_gru_unit_op',
     'test_amp_check_finite_and_scale_op',
     'test_imperative_selected_rows_to_lod_tensor',
-    'test_add_reader_dependency',
+    'test_add_reader_dependency_deprecated',
     'test_imperative_transformer_sorted_gradient',
     'test_bicubic_interp_v2_op',
     'test_rank_attention_op',
@@ -1693,7 +1695,7 @@
     'test_protobuf',
     'test_progressbar',
     'test_program_to_string',
-    'test_program_code',
+    'test_program_code_deprecated',
     'test_program',
     'test_precision_recall_op',
     'test_post_training_quantization_resnet50',
@@ -1755,7 +1757,7 @@
     'test_infer_shape',
     'test_infer_no_need_buffer_slots',
     'test_inference_model_io',
-    'test_inference_api',
+    'test_inference_api_deprecated',
     'test_imperative_signal_handler',
     'test_imperative_numpy_bridge',
     'test_imperative_group',
@@ -1880,7 +1882,7 @@
     'test_conv2d_bf16_mkldnn_op',
     'test_context_manager',
     'test_const_value',
-    'test_conditional_block',
+    'test_conditional_block_deprecated',
     'test_concat_int8_mkldnn_op',
     'test_concat_bf16_mkldnn_op',
     'test_compat',
@@ -1996,7 +1998,7 @@
     'test_dist_fleet_ps_gpu_ctr',
     'test_dist_mnist_backward_deps',
     'test_dist_fleet_heter_base',
-    'test_dist_sparse_tensor_load_sgd',
+    'test_dist_sparse_tensor_load_sgd_deprecated',
     'test_new_group',
     'test_dist_mnist_with_program',
     'test_dist_mnist_pg',
@@ -2087,7 +2089,7 @@
     'test_fake_init_op',
     'brpc_service_sparse_sgd_test',
     'test_tf32_cudnn',
-    'test_communicator_geo',
+    'test_communicator_geo_deprecated',
     'test_fleet_dgc_meta_optimizer',
     'test_fc_fuse_pass_cc',
     'test_communicator_sync',
@@ -2230,6 +2232,7 @@
     'test_analyzer_bert',
     'test_analyzer_googlenet',
     'test_fleet_base',
+    'test_fleet_base_deprecated',
     'test_dgc_momentum_op',
     'test_memcpy_op',
     'test_dgc_op',
@@ -2258,7 +2261,7 @@
     'test_reshape_op',
     'test_fused_transformer_encoder_layer',
     'test_eager_deletion_while_op',
-    'test_dataloader_unkeep_order',
+    'test_dataloader_unkeep_order_deprecated',
     'test_correlation',
     'test_moving_average_abs_max_scale_op',
     'test_flatten_contiguous_range_op',
@@ -2291,7 +2294,7 @@
     'test_imperative_trace_non_persistable_inputs',
     'test_executor_return_tensor_not_overwriting',
     'test_density_prior_box_op',
-    'test_dataloader_keep_order',
+    'test_dataloader_keep_order_deprecated',
     'test_bce_loss',
     'test_fetch_lod_tensor_array',
     'test_smooth_l1_loss',
@@ -2347,6 +2350,7 @@
     'test_diagflat',
     'test_determinant_op',
     'test_deform_conv2d',
+    'test_deform_conv2d_deprecated',
     'test_conv_transpose_nn_grad',
     'test_conj_op',
     'test_complex_reshape',
@@ -2527,7 +2531,7 @@
     'test_logical_op',
     'test_imperative_deepcf',
     'test_cholesky_op',
-    'test_ir_fc_fuse_pass',
+    'test_ir_fc_fuse_pass_deprecated',
     'test_fleet_base_single',
     'test_multiprocess_dataloader_iterable_dataset_dynamic',
     'test_slice_op',
@@ -2576,13 +2580,13 @@
     'test_seqconv_eltadd_relu_fuse_pass',
     'test_analysis_predictor',
     'test_convert_operators',
-    'test_add_reader_dependency',
+    'test_add_reader_dependency_deprecated',
     'test_is_tensor',
     'test_variable',
     'test_save_model_without_var',
     'test_unfold_op',
     'test_conv_bn_fuse_pass',
-    'test_truncated_gaussian_random_op',
+    'test_truncated_gaussian_random_op_deprecated',
     'test_traced_layer_err_msg',
     'test_unique_with_counts',
     'test_auc_single_pred_op',
@@ -2689,7 +2693,7 @@
     'test_imperative_save_load',
     'test_imperative_ptb_rnn_sorted_gradient',
     'test_mul_op',
-    'test_imperative_lod_tensor_to_selected_rows',
+    'test_imperative_lod_tensor_to_selected_rows_deprecated',
     'test_imperative_data_parallel',
     'test_norm_nn_grad',
     'test_im2sequence_op',
@@ -2764,7 +2768,7 @@
     'test_dot_op',
     'test_device',
     'test_imperative_layer_apply',
-    'test_dataloader_early_reset',
+    'test_dataloader_early_reset_deprecated',
     'test_imperative_selected_rows_to_lod_tensor',
     'test_crop_op',
     'test_linear_interp_v2_op',
@@ -2815,7 +2819,7 @@
     'test_sync_batch_norm_op',
     'test_static_save_load',
     'test_coalesce_tensor_op',
-    'test_fuse_bn_act_pass',
+    'test_fuse_bn_act_pass_deprecated',
     'test_shard_index_op',
     'test_cuda_random_seed',
     'test_dequantize_log_op',
@@ -2854,7 +2858,7 @@
     'test_eager_tensor',
     'trt_split_converter_test',
     'test_user_defined_quantization',
-    'test_quantization_scale_pass',
+    'test_quantization_scale_pass_deprecated',
     'feed_forward_test',
     'test_standalone_executor',
     'test_imperative_qat_user_defined',
@@ -2878,7 +2882,7 @@
     'test_tensor_register_hook',
     'test_fused_multihead_matmul_op',
     'test_uniform_random_inplace_op',
-    'test_decoupled_py_reader',
+    'test_decoupled_py_reader_deprecated',
     'test_assign_op',
     'test_trt_instance_norm_op',
     'test_uniform_random_op',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index d09a04abd045c..ba3e08b154541 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -15,19 +15,34 @@
 Print all signature of a python module in alphabet order.
 
 Usage:
-    ./print_signature  "paddle.base" > signature.txt
+    python tools/print_signatures.py "paddle" > API.spec
 """
 
+from __future__ import annotations
+
 import argparse
 import collections
 import hashlib
 import inspect
 import logging
 import pkgutil
+import re
 import sys
+from typing import Literal
 
 import paddle
 
+SpecFields = Literal[
+    "args",
+    "varargs",
+    "varkw",
+    "defaults",
+    "kwonlyargs",
+    "kwonlydefaults",
+    "annotations",
+    "document",
+]
+
 member_dict = collections.OrderedDict()
 
 visited_modules = set()
@@ -61,21 +76,6 @@ def md5(doc):
     return md5sum
 
 
-def is_primitive(instance):
-    int_types = (int,)
-    pritimitive_types = int_types + (float, str)
-    if isinstance(instance, pritimitive_types):
-        return True
-    elif isinstance(instance, (list, tuple, set)):
-        for obj in instance:
-            if not is_primitive(obj):
-                return False
-
-        return True
-    else:
-        return False
-
-
 ErrorSet = set()
 IdSet = set()
 skiplist = []
@@ -200,9 +200,7 @@ def insert_api_into_dict(full_name, gen_doc_anno=None):
             if gen_doc_anno:
                 api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno
             if inspect.isfunction(obj):
-                api_info_dict[fc_id]["signature"] = repr(
-                    inspect.getfullargspec(obj)
-                ).replace('FullArgSpec', 'ArgSpec', 1)
+                api_info_dict[fc_id]["signature"] = inspect.getfullargspec(obj)
         return api_info_dict[fc_id]
 
 
@@ -239,85 +237,6 @@ def process_module(m, attr="__all__"):
     return api_counter
 
 
-def check_public_api():
-    modulelist = [  # npqa
-        paddle,
-        paddle.amp,
-        paddle.nn,
-        paddle.nn.functional,
-        paddle.nn.initializer,
-        paddle.nn.utils,
-        paddle.static,
-        paddle.static.nn,
-        paddle.io,
-        paddle.jit,
-        paddle.metric,
-        paddle.distribution,
-        paddle.optimizer,
-        paddle.optimizer.lr,
-        paddle.regularizer,
-        paddle.text,
-        paddle.utils,
-        paddle.utils.download,
-        paddle.utils.cpp_extension,
-        paddle.sysconfig,
-        paddle.vision,
-        paddle.vision.datasets,
-        paddle.vision.models,
-        paddle.vision.transforms,
-        paddle.vision.ops,
-        paddle.distributed,
-        paddle.distributed.fleet,
-        paddle.distributed.fleet.utils,
-        paddle.distributed.parallel,
-        paddle.distributed.utils,
-        paddle.callbacks,
-        paddle.hub,
-        paddle.autograd,
-        paddle.incubate,
-        paddle.inference,
-        paddle.onnx,
-        paddle.device,
-        paddle.audio,
-        paddle.audio.backends,
-        paddle.audio.datasets,
-        paddle.sparse,
-        paddle.sparse.nn,
-        paddle.sparse.nn.functional,
-    ]
-
-    apinum = 0
-    alldict = {}
-    for module in modulelist:
-        if hasattr(module, '__all__'):
-            old_all = module.__all__
-        else:
-            old_all = []
-            dirall = dir(module)
-            for item in dirall:
-                if item.startswith('__'):
-                    continue
-                old_all.append(item)
-        apinum += len(old_all)
-        alldict.update({module.__name__: old_all})
-
-    old_all = []
-    dirall = dir(paddle.Tensor)
-    for item in dirall:
-        if item.startswith('_'):
-            continue
-        old_all.append(item)
-    apinum += len(old_all)
-    alldict.update({'paddle.Tensor': old_all})
-
-    for module, allapi in alldict.items():
-        for member_name in allapi:
-            cur_name = module + '.' + member_name
-            instance = eval(cur_name)
-            doc_md5 = md5(instance.__doc__)
-            member_dict[cur_name] = f"({cur_name}, ('document', '{doc_md5}'))"
-
-
 def check_allmodule_callable():
     modulelist = [paddle]
     for m in modulelist:
@@ -326,69 +245,89 @@ def check_allmodule_callable():
     return member_dict
 
 
+class ApiSpecFormatter:
+    def __init__(self, show_fields: list[SpecFields]):
+        self.show_fields = show_fields
+
+    def format_spec(self, spec: inspect.FullArgSpec | None) -> str:
+        if spec is None:
+            return "ArgSpec()"
+        inner_str = ", ".join(
+            f"{field}={getattr(spec, field)!r}"
+            for field in spec._fields
+            if field in self.show_fields
+        )
+        return f"ArgSpec({inner_str})"
+
+    def format_doc(self, doc: str) -> str:
+        if "document" not in self.show_fields:
+            return "('document', '**********')"
+        return f"('document', '{md5(doc)}')"
+
+    def format(self, api_name: str, spec: inspect.FullArgSpec, doc: str) -> str:
+        return f"{api_name} ({self.format_spec(spec)}, {self.format_doc(doc)})"
+
+
 def parse_args():
     """
     Parse input arguments
     """
     parser = argparse.ArgumentParser(description='Print Apis Signatures')
-    parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument('module', nargs='?', default='paddle', help='module')
     parser.add_argument(
-        '--method',
-        dest='method',
+        '--skipped',
+        dest='skipped',
         type=str,
-        default='get_all_api',
-        help="using get_all_api or from_modulelist",
+        help='Skip Checking submodules, support regex',
+        default=r'paddle\.base\.libpaddle\.(eager|pir)\.ops',
     )
     parser.add_argument(
-        'module', type=str, help='module', default='paddle'
-    )  # not used
-    parser.add_argument(
-        '--skipped',
-        dest='skipped',
+        '--show-fields',
         type=str,
-        help='Skip Checking submodules',
-        default='paddle.base.libpaddle.eager.ops',
+        default="args,varargs,varkw,defaults,kwonlyargs,kwonlydefaults,annotations,document",
+        help="show fields in arg spec, separated by comma, e.g. 'args,varargs'",
     )
-
-    if len(sys.argv) == 1:
-        args = parser.parse_args(['paddle'])
-        return args
-    #    parser.print_help()
-    #    sys.exit(1)
-
     args = parser.parse_args()
     return args
 
 
+def create_api_filter(skipped_regex: str):
+    if not skipped_regex:
+        return lambda api_name: True
+    skipped_pattern = re.compile(skipped_regex)
+
+    def api_filter(api_name: str) -> bool:
+        return not skipped_pattern.match(api_name)
+
+    return api_filter
+
+
 if __name__ == '__main__':
     args = parse_args()
     check_allmodule_callable()
-    if args.method == 'from_modulelist':
-        check_public_api()
-        for name in member_dict:
-            print(name, member_dict[name])
-    elif args.method == 'get_all_api':
-        get_all_api()
-        all_api_names_to_k = {}
-        for k, api_info in api_info_dict.items():
-            # 1. the shortest suggested_name may be renamed;
-            # 2. some api's fullname is not accessable, the module name of it is overrided by the function with the same name;
-            api_name = sorted(api_info['all_names'])[0]
-            all_api_names_to_k[api_name] = k
-        all_api_names_sorted = sorted(all_api_names_to_k.keys())
-        for api_name in all_api_names_sorted:
-            if args.skipped != '' and api_name.find(args.skipped) >= 0:
-                continue
-            api_info = api_info_dict[all_api_names_to_k[api_name]]
-            print(
-                "{} ({}, ('document', '{}'))".format(
-                    api_name,
-                    api_info['signature']
-                    if 'signature' in api_info
-                    else 'ArgSpec()',
-                    md5(api_info['docstring']),
-                )
+    get_all_api(args.module)
+    api_filter = create_api_filter(args.skipped)
+    spec_formatter = ApiSpecFormatter(args.show_fields.split(','))
+
+    all_api_names_to_k = {}
+    for k, api_info in api_info_dict.items():
+        # 1. the shortest suggested_name may be renamed;
+        # 2. some api's fullname is not accessible, the module name of it is overridden by the function with the same name;
+        api_name = sorted(api_info['all_names'])[0]
+        all_api_names_to_k[api_name] = k
+    all_api_names_sorted = sorted(all_api_names_to_k.keys())
+    for api_name in all_api_names_sorted:
+        if not api_filter(api_name):
+            continue
+        api_info = api_info_dict[all_api_names_to_k[api_name]]
+
+        print(
+            spec_formatter.format(
+                api_name,
+                api_info.get('signature'),
+                api_info['docstring'],
             )
+        )
 
     if len(ErrorSet) == 0:
         sys.exit(0)
diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py
index 12f15a5dec6e1..d3758493d0c00 100644
--- a/tools/prune_for_jetson.py
+++ b/tools/prune_for_jetson.py
@@ -101,9 +101,9 @@ def prune_phi_kernels():
 def apply_patches():
     work_path = os.path.dirname(os.path.abspath(__file__)) + "/../"
     ret = os.system(
-        "cd %s && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
+        f"cd {work_path} && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
         " && rm -f paddle/fluid/inference/api/paddle_tensorrt_predictor.h "
-        " && git apply tools/infer_prune_patches/*.patch && cd -" % work_path
+        " && git apply tools/infer_prune_patches/*.patch && cd -"
     )
     return ret == 0
 
@@ -120,7 +120,7 @@ def append_fluid_kernels():
     for op in op_white_list:
         append_str = (
             append_str
-            + "file(APPEND ${pybind_file} \"USE_OP__(%s);\\n\")\n" % op
+            + f"file(APPEND ${{pybind_file}} \"USE_OP__({op});\\n\")\n"
         )
 
     with open(file_name, 'r', encoding='utf-8') as f:
@@ -154,11 +154,9 @@ def append_fluid_kernels():
 
         for op in op_white_list:
             patterns = {
-                "REGISTER_OPERATOR": r"REGISTER_OPERATOR\(\s*%s\s*," % op,
-                "REGISTER_OP_CPU_KERNEL": r"REGISTER_OP_CPU_KERNEL\(\s*%s\s*,"
-                % op,
-                "REGISTER_OP_CUDA_KERNEL": r"REGISTER_OP_CUDA_KERNEL\(\s*%s\s*,"
-                % op,
+                "REGISTER_OPERATOR": rf"REGISTER_OPERATOR\(\s*{op}\s*,",
+                "REGISTER_OP_CPU_KERNEL": rf"REGISTER_OP_CPU_KERNEL\(\s*{op}\s*,",
+                "REGISTER_OP_CUDA_KERNEL": rf"REGISTER_OP_CUDA_KERNEL\(\s*{op}\s*,",
             }
             for k, p in patterns.items():
                 matches = re.findall(p, content, flags=re.DOTALL)
diff --git a/tools/sampcd_processor_utils.py b/tools/sampcd_processor_utils.py
index ff6de2b598326..aaf61fcd88dc0 100644
--- a/tools/sampcd_processor_utils.py
+++ b/tools/sampcd_processor_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import argparse
 import inspect
 import logging
@@ -48,6 +50,12 @@
 API_DIFF_SPEC_FN = 'dev_pr_diff_api.spec'
 TEST_TIMEOUT = 10
 
+PAT_API_SPEC_MEMBER = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
+# also capture the ArgSpec, so that changing an API's type annotation can trigger the CI
+PAT_API_SPEC_SIGNATURE = re.compile(
+    r'^(paddle[^,]+)\s+\((ArgSpec.*),.*document\W*([0-9a-z]{32})'
+)
+
 
 class Result:
     # name/key for result
@@ -66,7 +74,7 @@ class Result:
     order: int = 0
 
     @classmethod
-    def msg(cls, count: int, env: typing.Set) -> str:
+    def msg(cls, count: int, env: set) -> str:
         """Message for logging with api `count` and running `env`."""
         raise NotImplementedError
 
@@ -85,8 +93,8 @@ class MetaResult(type):
     def __new__(
         mcs,
         name: str,
-        bases: typing.Tuple[type, ...],
-        namespace: typing.Dict[str, typing.Any],
+        bases: tuple[type, ...],
+        namespace: dict[str, typing.Any],
     ) -> type:
         cls = super().__new__(mcs, name, bases, namespace)
         if issubclass(cls, Result):
@@ -104,7 +112,7 @@ def get(mcs, name: str) -> type:
         return mcs.__cls_map.get(name)
 
     @classmethod
-    def cls_map(mcs) -> typing.Dict[str, Result]:
+    def cls_map(mcs) -> dict[str, Result]:
         return mcs.__cls_map
 
 
@@ -290,7 +298,7 @@ def prepare(self, test_capacity: set) -> None:
         """
         pass
 
-    def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
+    def run(self, api_name: str, docstring: str) -> list[TestResult]:
         """Extract codeblocks from docstring, and run the test.
         Run only one docstring at a time.
 
@@ -304,7 +312,7 @@ def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
         raise NotImplementedError
 
     def print_summary(
-        self, test_results: typing.List[TestResult], whl_error: typing.List[str]
+        self, test_results: list[TestResult], whl_error: list[str]
     ) -> None:
         """Post process test results and print test summary.
 
@@ -333,17 +341,17 @@ def get_api_md5(path):
     API_spec = os.path.abspath(os.path.join(os.getcwd(), "..", path))
     if not os.path.isfile(API_spec):
         return api_md5
-    pat = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
-    patArgSpec = re.compile(
-        r'^(paddle[^,]+)\s+\(ArgSpec.*document\W*([0-9a-z]{32})'
-    )
+
     with open(API_spec) as f:
         for line in f.readlines():
-            mo = pat.search(line)
-            if not mo:
-                mo = patArgSpec.search(line)
+            mo = PAT_API_SPEC_MEMBER.search(line)
+
             if mo:
                 api_md5[mo.group(1)] = mo.group(2)
+            else:
+                mo = PAT_API_SPEC_SIGNATURE.search(line)
+                if mo:  # ignore lines matching neither spec format
+                    api_md5[mo.group(1)] = f'{mo.group(2)}, {mo.group(3)}'
     return api_md5
 
 
@@ -397,18 +405,6 @@ def get_full_api_from_pr_spec():
         get_full_api_by_walk()
 
 
-def get_full_api():
-    """
-    get all the apis
-    """
-    global API_DIFF_SPEC_FN  # readonly
-    from print_signatures import get_all_api_from_modulelist
-
-    member_dict = get_all_api_from_modulelist()
-    with open(API_DIFF_SPEC_FN, 'w') as f:
-        f.write("\n".join(member_dict.keys()))
-
-
 def extract_code_blocks_from_docstr(docstr, google_style=True):
     """
     extract code-blocks from the given docstring.
@@ -599,9 +595,16 @@ def get_test_capacity(run_on_device="cpu"):
     return sample_code_test_capacity
 
 
-def get_docstring(full_test=False):
+def get_docstring(
+    full_test: bool = False,
+    filter_api: typing.Callable[[str], bool] | None = None,
+):
     '''
     this function will get the docstring for test.
+
+    Args:
+        full_test: get all api.
+        filter_api: a function that filters apis; if it returns `True` for an api, that api is skipped and not added to `docstrings_to_test`.
     '''
     import paddle
     import paddle.static.quantization  # noqa: F401
@@ -616,6 +619,9 @@ def get_docstring(full_test=False):
     with open(API_DIFF_SPEC_FN) as f:
         for line in f.readlines():
             api = line.replace('\n', '')
+            if filter_api is not None and filter_api(api.strip()):
+                continue
+
             try:
                 api_obj = eval(api)
             except AttributeError:
@@ -637,7 +643,7 @@ def get_docstring(full_test=False):
     return docstrings_to_test, whl_error
 
 
-def check_old_style(docstrings_to_test: typing.Dict[str, str]):
+def check_old_style(docstrings_to_test: dict[str, str]):
     old_style_apis = []
     for api_name, raw_docstring in docstrings_to_test.items():
         for codeblock in extract_code_blocks_from_docstr(
@@ -715,8 +721,8 @@ def exec_gen_doc():
 
 
 def get_test_results(
-    doctester: DocTester, docstrings_to_test: typing.Dict[str, str]
-) -> typing.List[TestResult]:
+    doctester: DocTester, docstrings_to_test: dict[str, str]
+) -> list[TestResult]:
     """Get test results from doctester with docstrings to test."""
     _test_style = (
         doctester.style
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 8bfb9caef11c2..f63bc17488e77 100755
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -48,7 +48,7 @@
     'test_adaptive_avg_pool1d',
     'test_adaptive_max_pool1d',
     'test_add_position_encoding_op',
-    'test_add_reader_dependency',
+    'test_add_reader_dependency_deprecated',
     'test_addmm_op',
     'test_affine_grid_op',
     'test_allclose_layer',
@@ -96,7 +96,7 @@
     'test_compare_reduce_op',
     'test_compiled_program',
     'test_cond',
-    'test_conditional_block',
+    'test_conditional_block_deprecated',
     'test_context_manager',
     'test_conv1d_layer',
     'test_conv1d_transpose_layer',
@@ -122,12 +122,12 @@
     'test_cumsum_op',
     'test_cvm_op',
     'test_data',
-    'test_dataloader_early_reset',
-    'test_dataloader_keep_order',
-    'test_dataloader_unkeep_order',
+    'test_dataloader_early_reset_deprecated',
+    'test_dataloader_keep_order_deprecated',
+    'test_dataloader_unkeep_order_deprecated',
     'test_debugger',
     'test_decayed_adagrad_op',
-    'test_decoupled_py_reader',
+    'test_decoupled_py_reader_deprecated',
     'test_decoupled_py_reader_data_check',
     'test_deformable_conv_v1_op',
     'test_deformable_psroi_pooling',
@@ -252,7 +252,7 @@
     'test_imperative_gan',
     'test_imperative_gnn',
     'test_imperative_load_static_param',
-    'test_imperative_lod_tensor_to_selected_rows',
+    'test_imperative_lod_tensor_to_selected_rows_deprecated',
     'test_imperative_optimizer',
     'test_imperative_ptb_rnn',
     'test_imperative_ptb_rnn_sorted_gradient',
@@ -372,8 +372,8 @@
     'test_prior_box_op',
     'test_profiler',
     'test_program',
-    'test_program_code',
-    'test_program_prune_backward',
+    'test_program_code_deprecated',
+    'test_program_prune_backward_deprecated',
     'test_program_to_string',
     'test_protobuf_descs',
     'test_proximal_gd_op',
@@ -467,7 +467,7 @@
     'test_tril_triu_op',
     'test_trilinear_interp_op',
     'test_trilinear_interp_v2_op',
-    'test_truncated_gaussian_random_op',
+    'test_truncated_gaussian_random_op_deprecated',
     'test_unbind_op',
     'test_unfold_op',
     'test_uniform_random_bf16_op',
@@ -500,7 +500,7 @@
     'test_communicator_sync',
     'test_collective_optimizer',
     'test_data_norm_op',
-    'test_fuse_bn_act_pass',
+    'test_fuse_bn_act_pass_deprecated',
     'test_layers',
     'test_sequence_conv',
     'test_sequence_erase_op',
@@ -518,7 +518,7 @@
     'test_sequence_topk_avg_pooling',
     'test_sequence_unpad_op',
     'test_ir_embedding_eltwise_layernorm_fuse_pass',
-    'test_ir_fc_fuse_pass',
+    'test_ir_fc_fuse_pass_deprecated',
     'test_ir_skip_layernorm_pass',
     'test_conv_bias_mkldnn_fuse_pass',
     'test_conv_bn_fuse_pass',
@@ -603,6 +603,7 @@
     'test_fused_multihead_matmul_op',
     'test_rank_attention_op',
     'test_fleet_base',
+    'test_fleet_base_deprecated',
     'test_fleet_meta_optimizer_base',
     'test_trt_fc_fuse_pass',
     'test_trt_quant_conv2d_dequant_fuse_pass',
diff --git a/tools/statistics_UT_resource.sh b/tools/statistics_UT_resource.sh
index a6f1f264c4cd2..f97fc6f0dc51d 100644
--- a/tools/statistics_UT_resource.sh
+++ b/tools/statistics_UT_resource.sh
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tools/test_print_signatures.py b/tools/test_print_signatures.py
index 8a3bc60dcf9a7..20345d77b2566 100644
--- a/tools/test_print_signatures.py
+++ b/tools/test_print_signatures.py
@@ -25,7 +25,7 @@
 import hashlib
 import unittest
 
-from print_signatures import is_primitive, md5
+from print_signatures import md5
 
 
 def func_example(param_a, param_b):
@@ -62,26 +62,5 @@ def test_md5(self):
         self.assertEqual(digest, md5(func_example.__doc__))
 
 
-class Test_is_primitive(unittest.TestCase):
-    def test_single(self):
-        self.assertTrue(is_primitive(2))
-        self.assertTrue(is_primitive(2.1))
-        self.assertTrue(is_primitive("2.1.1"))
-        self.assertFalse(is_primitive(b"hello paddle"))
-        self.assertFalse(is_primitive(1j))
-        self.assertTrue(is_primitive(True))
-
-    def test_collection(self):
-        self.assertTrue(is_primitive([]))
-        self.assertTrue(is_primitive(()))
-        self.assertTrue(is_primitive(set()))
-        self.assertTrue(is_primitive([1, 2]))
-        self.assertTrue(is_primitive((1.1, 2.2)))
-        self.assertTrue(is_primitive({1, 2.3}))
-        self.assertFalse(is_primitive(range(3)))
-        self.assertFalse(is_primitive({}))
-        self.assertFalse(is_primitive([1, 1j]))
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/tools/test_run_by_protobuf_3.py b/tools/test_run_by_protobuf_3.py
index 52ce36683e380..7c353b9145c16 100644
--- a/tools/test_run_by_protobuf_3.py
+++ b/tools/test_run_by_protobuf_3.py
@@ -19,7 +19,7 @@
     'test_ema_fleet',
     'test_fleet_base2',
     'test_fleet_base3',
-    'test_communicator_geo',
+    'test_communicator_geo_deprecated',
     'test_communicator_async',
     'test_dist_fleet_a_sync_optimizer_async',
     'test_dist_fleet_a_sync_optimizer_auto',
@@ -51,7 +51,7 @@
     'test_dist_sparse_tensor_load_ftrl',
     'test_dist_sparse_tensor_load_momentum',
     'test_dist_sparse_tensor_load_rmsprop',
-    'test_dist_sparse_tensor_load_sgd',
+    'test_dist_sparse_tensor_load_sgd_deprecated',
     'test_communicator_sync',
     'test_dist_fuse_adam_pass',
     'test_dist_fuse_bn_act_pass',
diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py
index 62c51a73ba8a7..c61c7e610f98c 100644
--- a/tools/test_sampcd_processor.py
+++ b/tools/test_sampcd_processor.py
@@ -103,19 +103,23 @@ def tearDown(self):
     def test_get_api_md5(self):
         res = get_api_md5('paddle/fluid/API_PR.spec')
         self.assertEqual(
-            "ff0f188c95030158cc6398d2a6c55one", res['paddle.one_plus_one']
+            "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6c55one",
+            res['paddle.one_plus_one'],
         )
         self.assertEqual(
-            "ff0f188c95030158cc6398d2a6c55two", res['paddle.two_plus_two']
+            "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6c55two",
+            res['paddle.two_plus_two'],
         )
         self.assertEqual(
-            "ff0f188c95030158cc6398d2a6cthree", res['paddle.three_plus_three']
+            "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6cthree",
+            res['paddle.three_plus_three'],
         )
         self.assertEqual(
             "ff0f188c95030158cc6398d2a6c5four", res['paddle.four_plus_four']
         )
         self.assertEqual(
-            "ff0f188c95030158cc6398d2a6c5five", res['paddle.five_plus_five']
+            "ArgSpec(), ff0f188c95030158cc6398d2a6c5five",
+            res['paddle.five_plus_five'],
         )
 
 
@@ -302,8 +306,8 @@ def test_global_exec(self):
                     >>> import paddle
                     >>> a = paddle.to_tensor(.2)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    0.20000000)
             """,
             'set_default': """
             placeholder
@@ -319,8 +323,8 @@ def test_global_exec(self):
                     >>> paddle.set_default_dtype('float64')
                     >>> a = paddle.to_tensor(.2)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float64, place=Place(cpu), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True,
+                    0.20000000)
             """,
             'after_set_default': """
             placeholder
@@ -335,8 +339,8 @@ def test_global_exec(self):
                     >>> import paddle
                     >>> a = paddle.to_tensor(.2)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    0.20000000)
             """,
         }
 
@@ -509,10 +513,10 @@ def test_patch_xdoctest(self):
                     >>> import paddle
                     >>> paddle.device.set_device('gpu')
                     >>> a = paddle.to_tensor(.2)
-                    >>> # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.20000000])
+                    >>> # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 0.20000000)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    0.20000000)
 
             """,
             'cpu_to_cpu': """
@@ -528,10 +532,10 @@ def test_patch_xdoctest(self):
                     >>> import paddle
                     >>> paddle.device.set_device('cpu')
                     >>> a = paddle.to_tensor(.2)
-                    >>> # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.20000000])
+                    >>> # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 0.20000000)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    0.20000000)
 
             """,
             'gpu_to_cpu': """
@@ -547,10 +551,10 @@ def test_patch_xdoctest(self):
                     >>> import paddle
                     >>> paddle.device.set_device('gpu')
                     >>> a = paddle.to_tensor(.2)
-                    >>> # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.20000000])
+                    >>> # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 0.20000000)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    0.20000000)
 
             """,
             'cpu_to_gpu': """
@@ -566,10 +570,10 @@ def test_patch_xdoctest(self):
                     >>> import paddle
                     >>> paddle.device.set_device('cpu')
                     >>> a = paddle.to_tensor(.2)
-                    >>> # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.20000000])
+                    >>> # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 0.20000000)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-                    [0.20000000])
+                    Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    0.20000000)
             """,
             'gpu_to_cpu_array': """
             placeholder
@@ -701,8 +705,8 @@ def test_patch_xdoctest(self):
                     >>> paddle.device.set_device('gpu')
                     >>> a = paddle.to_tensor(.123456789)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-                    [0.123456780])
+                    Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    0.123456780)
 
             """,
             'cpu_to_cpu': """
@@ -719,8 +723,8 @@ def test_patch_xdoctest(self):
                     >>> paddle.device.set_device('cpu')
                     >>> a = paddle.to_tensor(.123456789)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
-                    [0.123456780])
+                    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    0.123456780)
 
             """,
             'gpu_to_cpu': """
@@ -737,8 +741,8 @@ def test_patch_xdoctest(self):
                     >>> paddle.device.set_device('gpu')
                     >>> a = paddle.to_tensor(.123456789)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
-                    [0.123456780])
+                    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    0.123456780)
 
             """,
             'cpu_to_gpu': """
@@ -755,8 +759,8 @@ def test_patch_xdoctest(self):
                     >>> paddle.device.set_device('cpu')
                     >>> a = paddle.to_tensor(.123456789)
                     >>> print(a)
-                    Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-                    [0.123456780])
+                    Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    0.123456780)
             """,
             'gpu_to_cpu_array': """
             placeholder
@@ -2046,7 +2050,7 @@ def test_timeout(self):
 
     def test_bad_statements(self):
         docstrings_to_test = {
-            'bad_fluid': """
+            'good_fluid': """
             this is docstring...
 
             Examples:
@@ -2191,9 +2195,9 @@ def test_bad_statements(self):
             tr_10,
         ) = test_results
 
-        self.assertIn('bad_fluid', tr_0.name)
-        self.assertTrue(tr_0.badstatement)
-        self.assertFalse(tr_0.passed)
+        self.assertIn('good_fluid', tr_0.name)
+        self.assertFalse(tr_0.badstatement)
+        self.assertTrue(tr_0.passed)
 
         self.assertIn('bad_fluid_from', tr_1.name)
         self.assertTrue(tr_1.badstatement)
diff --git a/tools/test_type_checking.py b/tools/test_type_checking.py
new file mode 100644
index 0000000000000..714be765ca9b5
--- /dev/null
+++ b/tools/test_type_checking.py
@@ -0,0 +1,630 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from tools.type_checking import MypyChecker, get_test_results
+
+
+class TestMypyChecker(unittest.TestCase):
+    def test_mypy_pass(self):  # every well-typed example must pass
+        docstrings_pass = {
+            'simple': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import abc
+                    >>> print(1)
+                    1
+            """,
+            'multi': """
+            placeholder
+
+            .. code-block:: python
+                :name: code-example-0
+
+                this is some blabla...
+
+                >>> # doctest: +SKIP('skip')
+                >>> print(1+1)
+                2
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> # doctest: -REQUIRES(env:GPU)
+                    >>> print(1-1)
+                    0
+
+                .. code-block:: python
+                    :name: code-example-2
+
+                    this is some blabla...
+
+                    >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED)
+                    >>> print(1-1)
+                    0
+            """,
+        }
+        docstrings_from_sampcd = {
+            'gpu_to_gpu': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('gpu')
+                    >>> a = paddle.to_tensor(.123456789)
+                    >>> print(a)
+                    Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    [0.123456780])
+
+            """,
+            'cpu_to_cpu': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('cpu')
+                    >>> a = paddle.to_tensor(.123456789)
+                    >>> print(a)
+                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    [0.123456780])
+
+            """,
+            'gpu_to_cpu': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('gpu')
+                    >>> a = paddle.to_tensor(.123456789)
+                    >>> print(a)
+                    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    [0.123456780])
+
+            """,
+            'cpu_to_gpu': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('cpu')
+                    >>> a = paddle.to_tensor(.123456789)
+                    >>> print(a)
+                    Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    [0.123456780])
+            """,
+            'gpu_to_cpu_array': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('gpu')
+                    >>> a = paddle.to_tensor([[1.123456789 ,2,3], [2,3,4], [3,4,5]])
+                    >>> print(a)
+                    Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    [[1.123456780, 2., 3.],
+                    [2., 3., 4.],
+                    [3., 4., 5.]])
+            """,
+            'cpu_to_gpu_array': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('cpu')
+                    >>> a = paddle.to_tensor([[1.123456789,2,3], [2,3,4], [3,4,5]])
+                    >>> print(a)
+                    Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+                    [[1.123456780, 2., 3.],
+                    [2., 3., 4.],
+                    [3., 4., 5.]])
+            """,
+            'mass_array': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('gpu')
+                    >>> a = paddle.to_tensor(
+                    ... [[1.123456780, 2., -3, .3],
+                    ... [2, 3, +4., 1.2+10.34e-5j],
+                    ... [3, 5.e-3, 1e2, 3e-8]]
+                    ... )
+                    >>> # Tensor(shape=[3, 4], dtype=complex64, place=Place(gpu:0), stop_gradient=True,
+                    >>> #       [[ (1.1234568357467651+0j)                    ,
+                    >>> #          (2+0j)                                     ,
+                    >>> #         (-3+0j)                                     ,
+                    >>> #          (0.30000001192092896+0j)                   ],
+                    >>> #        [ (2+0j)                                     ,
+                    >>> #          (3+0j)                                     ,
+                    >>> #          (4+0j)                                     ,
+                    >>> #         (1.2000000476837158+0.00010340000153519213j)],
+                    >>> #        [ (3+0j)                                     ,
+                    >>> #          (0.004999999888241291+0j)                  ,
+                    >>> #          (100+0j)                                   ,
+                    >>> #          (2.999999892949745e-08+0j)                 ]])
+                    >>> print(a)
+                    Tensor(shape=[3, 4], dtype=complex64, place=Place(AAA), stop_gradient=True,
+                        [[ (1.123456+0j),
+                            (2+0j),
+                            (-3+0j),
+                            (0.3+0j)],
+                            [ (2+0j),
+                            (3+0j),
+                            (4+0j),
+                            (1.2+0.00010340j)],
+                            [ (3+0j),
+                            (0.00499999+0j),
+                            (100+0j),
+                            (2.999999e-08+0j)]])
+            """,
+            'float_array': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('cpu')
+                    >>> x = [[2, 3, 4], [7, 8, 9]]
+                    >>> x = paddle.to_tensor(x, dtype='float32')
+                    >>> print(paddle.log(x))
+                    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                    [[0.69314718, 1.09861231, 1.38629436],
+                        [1.94591010, 2.07944155, 2.19722462]])
+
+            """,
+            'float_array_diff': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import paddle
+                    >>> paddle.device.set_device('cpu')
+                    >>> x = [[2, 3, 4], [7, 8, 9]]
+                    >>> x = paddle.to_tensor(x, dtype='float32')
+                    >>> print(paddle.log(x))
+                    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                        [[0.69314712, 1.09861221, 1.386294],
+                        [1.94591032, 2.07944156, 2.1972246]])
+
+            """,
+            'float_begin': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> print(7.0)
+                    7.
+
+            """,
+            'float_begin_long': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> print(7.0000023)
+                    7.0000024
+
+            """,
+            'float_begin_more': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> print(7.0, 5., 6.123456)
+                    7.0 5.0 6.123457
+
+            """,
+            'float_begin_more_diff': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> print(7.0, 5., 6.123456)
+                    7.0 5.0 6.123457
+
+            """,
+            'float_begin_more_brief': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> print(7.0, 5., 6.123456)
+                    7. 5. 6.123457
+
+            """,
+            'float_begin_fail': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> print(7.0100023)
+                    7.0000024
+
+            """,
+        }
+        doctester = MypyChecker('../pyproject.toml', '../.mypy_cache')
+
+        test_results = get_test_results(doctester, docstrings_pass)
+        self.assertEqual(len(test_results), 3)
+
+        for tr in test_results:
+            self.assertFalse(tr.fail, tr.msg)
+
+        test_results = get_test_results(doctester, docstrings_from_sampcd)
+        self.assertEqual(len(test_results), 15)
+
+        for tr in test_results:
+            # the mypy report is attached as the assertion message
+            self.assertFalse(tr.fail, tr.msg)
+
+    def test_mypy_fail(self):  # every ill-typed block must be reported
+        docstrings_fail = {
+            'fail_simple': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import blabla
+            """,
+            'multi': """
+            placeholder
+
+            .. code-block:: python
+                :name: code-example-0
+
+                this is some blabla...
+
+                >>> # doctest: +SKIP('skip')
+                >>> print(1+1)
+                2
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> # doctest: -REQUIRES(env:GPU)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+
+                .. code-block:: python
+                    :name: code-example-2
+
+                    this is some blabla...
+
+                    >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+            """,
+        }
+
+        doctester = MypyChecker('../pyproject.toml', '../.mypy_cache')
+
+        test_results = get_test_results(doctester, docstrings_fail)
+        self.assertEqual(len(test_results), 3)
+
+        for tr in test_results:
+            self.assertTrue(tr.fail)
+
+    def test_mypy_partial_fail(self):  # blocks are judged independently
+        docstrings_fail = {
+            'multi': """
+            placeholder
+
+            .. code-block:: python
+                :name: code-example-0
+
+                this is some blabla...
+
+                >>> # doctest: +SKIP('skip')
+                >>> print(1+1)
+                2
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> # doctest: -REQUIRES(env:GPU)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+
+                .. code-block:: python
+                    :name: code-example-2
+
+                    this is some blabla...
+
+                    >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED)
+                    >>> print(1-1)
+                    0
+            """
+        }
+
+        doctester = MypyChecker('../pyproject.toml', '../.mypy_cache')
+
+        test_results = get_test_results(doctester, docstrings_fail)
+        self.assertEqual(len(test_results), 2)
+
+        tr_0, tr_1 = test_results
+        self.assertTrue(tr_0.fail)
+        self.assertFalse(tr_1.fail)
+
+    def test_mypy_ignore(self):  # `# type: ignore` cuts the checked code short
+        docstrings_ignore = {
+            'fail_simple': """
+            placeholder
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> # type: ignore
+                    >>> import blabla
+            """,
+            'multi': """
+            placeholder
+
+            .. code-block:: python
+                :name: code-example-0
+
+                this is some blabla...
+
+                >>> # doctest: +SKIP('skip')
+                >>> print(1+1)
+                2
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> # type: ignore
+                    >>> # doctest: -REQUIRES(env:GPU)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+
+                .. code-block:: python
+                    :name: code-example-2
+
+                    this is some blabla...
+
+                    >>> # type: ignore
+                    >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+            """,
+        }
+
+        doctester = MypyChecker('../pyproject.toml', '../.mypy_cache')
+
+        test_results = get_test_results(doctester, docstrings_ignore)
+        self.assertEqual(len(test_results), 3)
+
+        for tr in test_results:
+            # the mypy report is attached as the assertion message
+            self.assertFalse(tr.fail, tr.msg)
+
+        docstrings_pass = {
+            'pass': """
+            placeholder
+
+            .. code-block:: python
+                :name: code-example-0
+
+                this is some blabla...
+
+                >>> # doctest: +SKIP('skip')
+                >>> print(1+1)
+                2
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> a = 1
+                    >>> # type: ignore
+                    >>> # doctest: -REQUIRES(env:GPU)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+
+                .. code-block:: python
+                    :name: code-example-2
+
+                    this is some blabla...
+
+                    >>> b = 2
+                    >>> # type: ignore
+                    >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+            """,
+        }
+
+        doctester = MypyChecker('../pyproject.toml', '../.mypy_cache')
+
+        test_results = get_test_results(doctester, docstrings_pass)
+        self.assertEqual(len(test_results), 2)
+
+        for tr in test_results:
+            # the mypy report is attached as the assertion message
+            self.assertFalse(tr.fail, tr.msg)
+
+        docstrings_fail = {
+            'fail': """
+            placeholder
+
+            .. code-block:: python
+                :name: code-example-0
+
+                this is some blabla...
+
+                >>> # doctest: +SKIP('skip')
+                >>> print(1+1)
+                2
+
+            Examples:
+
+                .. code-block:: python
+                    :name: code-example-1
+
+                    this is some blabla...
+
+                    >>> import blabla
+                    >>> a = 1
+                    >>> # type: ignore
+                    >>> # doctest: -REQUIRES(env:GPU)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+
+                .. code-block:: python
+                    :name: code-example-2
+
+                    this is some blabla...
+
+                    >>> import blabla
+                    >>> # type: ignore
+                    >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED)
+                    >>> blabla
+                    >>> print(1-1)
+                    0
+            """,
+        }
+
+        doctester = MypyChecker('../pyproject.toml', '../.mypy_cache')
+
+        test_results = get_test_results(doctester, docstrings_fail)
+        self.assertEqual(len(test_results), 2)
+
+        for tr in test_results:
+            # the mypy report is attached as the assertion message
+            self.assertTrue(tr.fail, tr.msg)
diff --git a/tools/timeline.py b/tools/timeline.py
index ff8d0946378d7..5e16e0b9bf4f3 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -148,7 +148,7 @@ def _allocate_pids(self):
                         self._devices[(k, event.device_id, "CPU")] = pid
                         # -1 device id represents CUDA API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
                         if event.device_id == -1:
-                            self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
+                            self._chrome_trace.emit_pid(f"{k}:cuda_api", pid)
                         else:
                             self._chrome_trace.emit_pid(
                                 "%s:cpu:block:%d" % (k, event.device_id), pid
diff --git a/tools/timeout_debug_help.sh b/tools/timeout_debug_help.sh
index 45de2db87e853..fcc6d473e49eb 100644
--- a/tools/timeout_debug_help.sh
+++ b/tools/timeout_debug_help.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,7 +17,7 @@ set +e
 failed_uts=$1
 need_debug_ut_re='test_dist_fleet'
 cat_log_judge=$(echo "${failed_uts}" | grep 'Timeout' |  grep -oEi "$need_debug_ut_re" )
-if [[ "$cat_log_judge" != "" ]];then 
+if [[ "$cat_log_judge" != "" ]];then
     echo "=============================================="
     echo "show timeout ut logs"
     echo "=============================================="
diff --git a/tools/type_checking.py b/tools/type_checking.py
new file mode 100644
index 0000000000000..78285cb87eaa4
--- /dev/null
+++ b/tools/type_checking.py
@@ -0,0 +1,276 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# We type-check the `Example` codes from docstring.
+
+from __future__ import annotations
+
+import argparse
+import doctest
+import pathlib
+import re
+from abc import abstractmethod
+from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass, field
+from typing import Any
+
+from mypy import api as mypy_api
+from sampcd_processor_utils import (
+    extract_code_blocks_from_docstr,
+    get_docstring,
+    init_logger,
+    log_exit,
+    logger,
+)
+
+
+class TypeChecker:
+    style: str = 'google'  # docstring layout used to locate code blocks
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Ignore all arguments; subclasses store whatever they need."""
+
+    @abstractmethod
+    def run(self, api_name: str, codeblock: str) -> TestResult:
+        """Type-check one code block and describe the outcome."""
+
+    @abstractmethod
+    def print_summary(
+        self, test_results: list[TestResult], whl_error: list[str]
+    ) -> None:
+        """Report the collected results together with `whl_error`."""
+
+
+@dataclass
+class TestResult:
+    api_name: str  # label of the checked code block, as passed to `run`
+    msg: str  # checker output kept for reporting
+    fail: bool = False  # True when the check did not succeed
+    extra_info: dict[str, Any] = field(default_factory=dict)  # raw details
+
+
+class MypyChecker(TypeChecker):
+    """Type-check docstring examples by running mypy on the extracted code."""
+
+    def __init__(
+        self,
+        config_file: str,
+        cache_dir: str | None = None,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        # `cache_dir` may be omitted; `--cache-dir` is then not passed to mypy
+        self.config_file = config_file
+        self.cache_dir = cache_dir
+        super().__init__(*args, **kwargs)
+
+    def run(self, api_name: str, codeblock: str) -> TestResult:
+        """Run mypy on `codeblock` and wrap its reports in a `TestResult`.
+
+        Only the lines before the first `>>> # type: ignore` are checked.
+        """
+        checked_lines = []
+        for line in codeblock.splitlines():
+            if line.strip().startswith('>>> # type: ignore'):
+                break
+            checked_lines.append(line)
+
+        # drop `doctest` directives, or `get_examples` cannot parse the block
+        checked = re.sub(r'#\s*x?doctest\s*:.*', '', '\n'.join(checked_lines))
+
+        # `get_examples` yields the sources with `>>>` and `...` stripped
+        examples = doctest.DocTestParser().get_examples(checked)
+        example_code = '\n'.join(
+            [l for e in examples for l in e.source.splitlines()]
+        )
+
+        mypy_args = [f'--config-file={self.config_file}']
+        if self.cache_dir is not None:
+            mypy_args.append(f'--cache-dir={self.cache_dir}')
+        normal_report, error_report, exit_status = mypy_api.run(
+            [*mypy_args, '-c', example_code]
+        )
+
+        logger.debug('-' * 20)
+        logger.debug(f'>>> Type hints with api {api_name} start ...')
+        logger.debug(example_code)
+        logger.debug('>>> Results ...')
+        logger.debug('>>> mypy normal_report is ...')
+        logger.debug(normal_report)
+        logger.debug('>>> mypy error_report is ...')
+        logger.debug(error_report)
+        logger.debug('>>> mypy exit_status is ...')
+        logger.debug(exit_status)
+        logger.debug(f'>>> Type hints with api {api_name} end...')
+
+        return TestResult(
+            api_name=api_name,
+            msg='\n'.join([normal_report, error_report]),
+            fail=exit_status != 0,
+            extra_info={
+                'normal_report': normal_report,
+                'error_report': error_report,
+                'exit_status': exit_status,
+            },
+        )
+
+    def print_summary(
+        self, test_results: list[TestResult], whl_error: list[str]
+    ) -> None:
+        """Log every result and call `log_exit(1)` when anything went wrong."""
+        logger.warning("----------------Check results--------------------")
+
+        if whl_error:
+            logger.warning("%s is not in whl.", whl_error)
+            logger.warning("")
+            logger.warning("Please check the whl package and API_PR.spec!")
+            logger.warning(
+                "You can follow these steps in order to generate API.spec:"
+            )
+            logger.warning("1. cd ${paddle_path}, compile paddle;")
+            logger.warning(
+                "2. pip install build/python/dist/(build whl package);"
+            )
+            logger.warning(
+                "3. run 'python tools/print_signatures.py paddle > paddle/fluid/API.spec'."
+            )
+            for test_result in test_results:
+                if test_result.fail:
+                    logger.error(
+                        ">>> In addition, mistakes found in type checking: %s",
+                        test_result.api_name,
+                    )
+                    logger.error(test_result.msg)
+            log_exit(1)
+        else:
+            is_fail = False
+            for test_result in test_results:
+                if test_result.fail:
+                    is_fail = True
+                    logger.error(test_result.api_name)
+                    logger.error(test_result.msg)
+                else:
+                    logger.debug(test_result.api_name)
+                    logger.debug(test_result.msg)
+
+            if is_fail:
+                logger.error(">>> Mistakes found in type checking!")
+                logger.error(">>> Please recheck the type annotations.")
+                log_exit(1)
+
+        logger.warning(">>> Type checking is successful!")
+        logger.warning("----------------End of the Check--------------------")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the type-checking script.
+
+    Returns:
+        argparse.Namespace: the booleans `debug` and `full_test`, plus the
+        strings `logf`, `config_file` and `cache_dir`, each of which is
+        `None` when the option is not given.
+    """
+    cli = argparse.ArgumentParser(description='run Sample Code Type Checking')
+    cli.add_argument('--debug', dest='debug', action="store_true")
+
+    # (flag, dest, help) of the string-valued options, in declaration order
+    text_options = (
+        ('--logf', 'logf', 'file for logging'),
+        ('--config-file', 'config_file', 'config file for type checker'),
+        ('--cache-dir', 'cache_dir', 'cache dir for mypy'),
+    )
+    for flag, dest, help_text in text_options:
+        cli.add_argument(
+            flag,
+            dest=dest,
+            type=str,
+            default=None,
+            help=help_text,
+        )
+
+    cli.add_argument('--full-test', dest='full_test', action="store_true")
+
+    return cli.parse_args()
+
+
+def get_test_results(
+    type_checker: TypeChecker, docstrings_to_test: dict[str, str]
+) -> list[TestResult]:
+    """Run `type_checker.run` over every code block found in the docstrings.
+
+    Each block is labelled `<api name>:<block name or id>`; the checks are
+    submitted to a process pool and the results come back in block order.
+    """
+    # every style other than 'freeform' is treated as google style
+    google_style = type_checker.style != 'freeform'
+
+    labels = []
+    sources = []
+    for api_name, raw_docstring in docstrings_to_test.items():
+        # a single docstring may contain several code blocks
+        blocks = extract_code_blocks_from_docstr(
+            raw_docstring, google_style=google_style
+        )
+        for block in blocks:
+            block_name, block_id = block['name'], block['id']
+            labels.append(f'{api_name}:{block_name or block_id}')
+            sources.append(block['codes'])
+
+    with ProcessPoolExecutor() as pool:
+        pending = pool.map(type_checker.run, labels, sources, timeout=600)
+
+    # NOTE(review): the iterator is drained only after the pool has been
+    # shut down by the `with` block, so `timeout` presumably never fires;
+    # confirm whether that is intended.
+    return list(pending)
+
+
+def run_type_checker(
+    args: argparse.Namespace, type_checker: TypeChecker
+) -> None:
+    """Collect API docstrings, type-check their examples and log a summary."""
+    init_logger(debug=args.debug, log_file=args.logf)
+
+    title = "----------------Codeblock Type Checking Start--------------------"
+    logger.info(title)
+
+    logger.info(">>> Get docstring from api ...")
+    docstrings_to_test, whl_error = get_docstring(
+        full_test=args.full_test,
+        # the predicate matches API names that contain 'libpaddle'
+        filter_api=lambda api_name: 'libpaddle' in api_name,
+    )
+
+    logger.info(">>> Running type checker ...")
+    test_results = get_test_results(type_checker, docstrings_to_test)
+
+    logger.info(">>> Print summary ...")
+    type_checker.print_summary(test_results, whl_error)
+
+
+if __name__ == '__main__':
+    # Script entry point: the fallback paths below are resolved against the
+    # directory two levels above this file.
+    base_path = pathlib.Path(__file__).resolve().parent.parent
+
+    args = parse_args()
+    # `MypyChecker` annotates both paths as `str`, so the `Path` fallbacks
+    # are converted explicitly instead of being passed through as-is.
+    config_file = args.config_file or str(base_path / 'pyproject.toml')
+    cache_dir = args.cache_dir or str(base_path / '.mypy_cache')
+
+    mypy_checker = MypyChecker(
+        config_file=config_file, cache_dir=cache_dir
+    )
+    run_type_checker(args, mypy_checker)
diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat
index 884cea8ca4cd0..016e2a4ff25cb 100644
--- a/tools/windows/build_compile_environment.bat
+++ b/tools/windows/build_compile_environment.bat
@@ -16,7 +16,7 @@
 :: Build Paddle compile environment
 :: ===============================
 :: Description:
-::   
+::
 ::   Install compile environment for xly CI.
 ::
 ::   Include:
@@ -55,7 +55,7 @@ if %errorlevel% == 0 (
 ) else (
   echo Error***** Download wget tool failed, please download it before rerun.
   exit /b 1
-) 
+)
 goto :eof
 :: ===== end step 0: wget tool =====
 
@@ -296,7 +296,7 @@ goto tensorrt
 echo There is not sccache in this PC, will install sccache.
 echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe
 wget -O sccache.exe "https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe"
-copy sccache.exe C:\Python38 /Y 
+copy sccache.exe C:\Python38 /Y
 goto :eof
 :: ===== end step 7: sccache on windows =====
 
diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh
index 576f0e5d238ab..25073435e3fb2 100644
--- a/tools/windows/check_change_of_unittest.sh
+++ b/tools/windows/check_change_of_unittest.sh
@@ -19,7 +19,7 @@ GITHUB_API_TOKEN=$GITHUB_API_TOKEN
 GIT_PR_ID=$AGILE_PULL_ID
 BRANCH=$BRANCH
 if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then
-    exit 0 
+    exit 0
 fi
 
 unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g')
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index ebca6e41296fd..5d259e101b56d 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -16,15 +16,15 @@
 # /*================Fixed Disabled Windows CUDA10.x MKL(PR-CI-Windows) unittests===========================*/
 # TODO: fix these unittest that is bound to fail
 disable_wingpu_test="^test_model$|\
-^test_dataloader_early_reset$|\
-^test_add_reader_dependency$|\
+^test_dataloader_early_reset_deprecated$|\
+^test_add_reader_dependency_deprecated$|\
 ^test_add_reader_dependency_for_interpretercore$|\
-^test_decoupled_py_reader$|\
-^test_decoupled_py_reader_static_build$|\
+^test_decoupled_py_reader_deprecated$|\
+^test_decoupled_py_reader_deprecated_static_build$|\
 ^test_generator_dataloader_deprecated$|\
 ^test_parallel_dygraph_sync_batch_norm$|\
 ^test_py_reader_using_executor$|\
-^test_program_prune_backward$|\
+^test_program_prune_backward_deprecated$|\
 ^test_decoupled_py_reader_data_check$|\
 ^test_fleet_base_single$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
@@ -35,11 +35,11 @@ disable_wingpu_test="^test_model$|\
 ^test_imperative_se_resnext$|\
 ^test_sync_batch_norm_op$|\
 ^test_sync_batch_norm_op_static_build$|\
-^test_dataloader_keep_order$|\
-^test_dataloader_unkeep_order$|\
+^test_dataloader_keep_order_deprecated$|\
+^test_dataloader_unkeep_order_deprecated$|\
 ^test_multiprocess_dataloader_iterable_dataset_static$|\
-^test_fuse_bn_act_pass$|\
-^test_fuse_bn_act_pass_static_build$|\
+^test_fuse_bn_act_pass_deprecated$|\
+^test_fuse_bn_act_pass_deprecated_static_build$|\
 ^test_fuse_bn_add_act_pass$|\
 ^test_gather_op$|\
 ^test_activation_op$|\
@@ -184,7 +184,7 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_functional_conv3d$|\
 ^test_functional_conv3d_transpose$|\
 ^test_imperative_layer_children$|\
-^test_inference_api$|\
+^test_inference_api_deprecated$|\
 ^test_trans_layout_op$|\
 ^test_pool2d_op$|\
 ^test_conv3d_transpose_op$|\
@@ -211,7 +211,7 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_callback_visualdl$|\
 ^test_callback_wandb$|\
 ^test_user_defined_quantization$|\
-^test_quantization_scale_pass$|\
+^test_quantization_scale_pass_deprecated$|\
 ^test_quantization_pass$|\
 ^test_imperative_qat$|\
 ^test_graph$|\
@@ -219,24 +219,24 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_gru_unit_op$|\
 ^test_matmul_op$|\
 ^test_decoupled_py_reader_data_check$|\
-^test_decoupled_py_reader$|\
+^test_decoupled_py_reader_deprecated$|\
 ^test_generator_dataloader_deprecated$|\
 ^test_py_reader_combination$|\
 ^test_reader_reset$|\
 ^test_sync_batch_norm_op$|\
 ^test_sync_batch_norm_op_static_build$|\
-^test_decoupled_py_reader_static_build$|\
+^test_decoupled_py_reader_deprecated_static_build$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
 ^test_multiprocess_dataloader_iterable_dataset_static$|\
-^test_dataloader_keep_order$|\
-^test_dataloader_unkeep_order$|\
-^test_add_reader_dependency$|\
-^test_fuse_bn_act_pass$|\
-^test_fuse_bn_act_pass_static_build$|\
+^test_dataloader_keep_order_deprecated$|\
+^test_dataloader_unkeep_order_deprecated$|\
+^test_add_reader_dependency_deprecated$|\
+^test_fuse_bn_act_pass_deprecated$|\
+^test_fuse_bn_act_pass_deprecated_static_build$|\
 ^test_fuse_bn_add_act_pass$|\
 ^test_model$|\
-^test_dataloader_early_reset$|\
-^test_add_reader_dependency$|\
+^test_dataloader_early_reset_deprecated$|\
+^test_add_reader_dependency_deprecated$|\
 ^test_conv2d_fusion_op$|\
 ^test_fused_conv2d_add_act_op$|\
 ^test_analyzer_detect_functional_mkldnn$|\
@@ -351,6 +351,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^test_custom_relu_op_setup$|\
 ^test_conv3d_transpose_part2_op$|\
 ^test_deform_conv2d$|\
+^test_deform_conv2d_deprecated$|\
 ^test_matmul_op$|\
 ^test_matmul_op_static_build$|\
 ^test_basic_api_transformation$|\
@@ -370,7 +371,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^test_graph_khop_sampler$|\
 ^test_gru_rnn_op$|\
 ^test_masked_select_op$|\
-^test_ir_fc_fuse_pass$|\
+^test_ir_fc_fuse_pass_deprecated$|\
 ^test_fc_elementwise_layernorm_fuse_pass$|\
 ^test_linalg_pinv_op$|\
 ^test_math_op_patch_var_base$|\
@@ -395,23 +396,23 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^test_py_reader_pin_memory$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
 ^test_multiprocess_dataloader_iterable_dataset_static$|\
-^test_add_reader_dependency$|\
+^test_add_reader_dependency_deprecated$|\
 ^test_add_reader_dependency_for_interpretercore$|\
 ^test_compat$|\
-^test_decoupled_py_reader$|\
-^test_decoupled_py_reader_static_build$|\
+^test_decoupled_py_reader_deprecated$|\
+^test_decoupled_py_reader_deprecated_static_build$|\
 ^test_generator_dataloader_deprecated$|\
 ^test_py_reader_using_executor$|\
-^test_dataloader_keep_order$|\
-^test_dataloader_unkeep_order$|\
+^test_dataloader_keep_order_deprecated$|\
+^test_dataloader_unkeep_order_deprecated$|\
 ^test_sync_batch_norm_op$|\
 ^test_sync_batch_norm_op_static_build$|\
-^test_fuse_bn_act_pass$|\
-^test_fuse_bn_act_pass_static_build$|\
+^test_fuse_bn_act_pass_deprecated$|\
+^test_fuse_bn_act_pass_deprecated_static_build$|\
 ^test_fuse_bn_add_act_pass$|\
 ^test_decoupled_py_reader_data_check$|\
 ^test_parallel_dygraph_sync_batch_norm$|\
-^test_dataloader_early_reset$|\
+^test_dataloader_early_reset_deprecated$|\
 ^test_fleet_base_single$|\
 ^test_sequence_pool$|\
 ^test_simplify_with_basic_ops_pass_autoscan$|\
@@ -436,7 +437,7 @@ disable_wincpu_test="^jit_kernel_test$|\
 ^test_vision_models$|\
 ^test_dygraph_multi_forward$|\
 ^test_imperative_transformer_sorted_gradient$|\
-^test_program_prune_backward$|\
+^test_program_prune_backward_deprecated$|\
 ^test_imperative_resnet$|\
 ^test_imperative_resnet_sorted_gradient$|\
 ^test_imperative_se_resnext$|\
@@ -464,7 +465,7 @@ long_time_test="^test_gru_op$|\
 ^test_cross_op$|\
 ^test_elementwise_nn_grad$|\
 ^test_fused_elemwise_activation_op$|\
-^test_imperative_lod_tensor_to_selected_rows$|\
+^test_imperative_lod_tensor_to_selected_rows_deprecated$|\
 ^test_imperative_selected_rows_to_lod_tensor$|\
 ^test_layer_norm_op$|\
 ^test_layer_norm_op_static_build$|\
diff --git a/tools/xpu/get_xpti_dependence.sh b/tools/xpu/get_xpti_dependence.sh
index 95cc4a110ed6d..6801990933d76 100644
--- a/tools/xpu/get_xpti_dependence.sh
+++ b/tools/xpu/get_xpti_dependence.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.