diff --git a/.clang-format b/.clang-format index 04f2bbaf85b2c..a4de8e7be8e07 100644 --- a/.clang-format +++ b/.clang-format @@ -6,11 +6,11 @@ # The basic usage is, # clang-format -i -style=file PATH/TO/SOURCE/CODE # -# The -style=file implicit use ".clang-format" file located in one of -# parent directory. +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. # The -i means inplace change. # -# The document of clang-format is +# The document of clang-format is # http://clang.llvm.org/docs/ClangFormat.html # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- @@ -20,7 +20,7 @@ IndentWidth: 2 TabWidth: 2 ContinuationIndentWidth: 4 AccessModifierOffset: -1 # The private/protected/public has no indent in class -Standard: Cpp11 +Standard: Cpp11 AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1fcb3dc4f521d..7b62f131b9587 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -53,7 +53,6 @@ python/paddle/base/compiler.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84 python/paddle/base/dygraph/layers.py @JiabinYang @phlrain python/paddle/base/framework.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84 python/paddle/base/__init__.py @phlrain @Aurelius84 @qili93 -python/paddle/base/parallel_executor.py @Xreki @zhhsplendid @Aurelius84 python/paddle/base/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py @Aurelius84 @phlrain python/paddle/base/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py @Aurelius84 @phlrain python/paddle/base/tests/unittests/white_list/check_shape_white_list.py @hong19860320 @Aurelius84 @phlrain diff --git a/CMakeLists.txt b/CMakeLists.txt index 0aa41a26d700e..f0b2fa79d362a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,9 @@ if(WITH_GPU AND WITH_ROCM) endif() if(WITH_GPU AND NOT APPLE) + if(WITH_PIP_CUDA_LIBRARIES AND CMAKE_SYSTEM_NAME STREQUAL "Windows") + add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES) + endif() #(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") @@ -107,8 +110,8 @@ if(WITH_GPU AND NOT APPLE) CACHE BOOL "" FORCE) set(CMAKE_CUDA_FLAGS "--cudart shared") if(WITH_PIP_CUDA_LIBRARIES) - #(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA. - add_definitions(-DWITH_PIP_CUDA_LIBRARIES) + #(Note risemeup1): Flag 'PADDLE_WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA. + add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES) endif() endif() enable_language(CUDA) diff --git a/cmake/PaddleConfig.cmake.in b/cmake/PaddleConfig.cmake.in index d32c23f6f6edd..e55038bb77c63 100644 --- a/cmake/PaddleConfig.cmake.in +++ b/cmake/PaddleConfig.cmake.in @@ -12,7 +12,7 @@ get_filename_component(PADDLE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_FILE}/../.." 
ABSOLUTE)

 # include directories
-set(PADDLE_INCLUDE_DIRS
+set(PADDLE_INCLUDE_DIRS
   ${PADDLE_INSTALL_PREFIX}/include
   ${PADDLE_INSTALL_PREFIX}/include/third_party
 )
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
index ad8ee179d60c2..e80900da58777 100644
--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
@@ -24,7 +24,7 @@
     "const unsigned char "
     + var
     + "[] = {"
-    + ",".join(["0x%02x" % ord(c) for c in open(res).read()])
+    + ",".join([f"0x{ord(c):02x}" for c in open(res).read()])
     + ",0};\n"
     + "const unsigned "
     + var
diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc
index 42986fff0dbb1..54805f2c78f50 100644
--- a/paddle/cinn/ast_gen_ius/ast_gen.cc
+++ b/paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -131,6 +131,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
     } else {
       iter_values.push_back(axis_vars[i]);
     }
+    ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
   }
   VLOG(4) << "iter_value.size() and block_vars.size() is "
           << iter_values.size() << " " << block_vars.size();
@@ -167,6 +168,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
     } else {
       reduce_iter_values.push_back(axis_vars[i]);
     }
+    ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
   }
   VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body;
   for (int i = 0; i < reduce_axis.size(); ++i) {
@@ -227,6 +229,9 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
       ir::ScheduleBlock::Make(
           reduce_block_vars, {}, {}, tensor->name, reduce_body));
   for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
+    ir::TryElevateInt32ToInt64({reduce_axis[i],
+                                reduce_axis[i]->lower_bound,
+                                reduce_axis[i]->upper_bound});
     reduce_body = ir::For::Make(reduce_axis[i],
                                 reduce_axis[i]->lower_bound,
                                 reduce_axis[i]->upper_bound,
diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index 6f00ee34813d1..c51ba89806956 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -32,7 +32,7 @@
 #include "paddle/cinn/lang/lower.h"
 #include "paddle/cinn/optim/optimize.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -193,10 +193,14 @@ ir::LoweredFunc UpdateFuncWithNewBody(const cinn::common::Target& target,
 std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
   const ir::ScheduleBlockRealize* block_realize =
       block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
   const ir::ScheduleBlock* block_node =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlock"));
   std::vector<ir::Expr> iter_values = block_realize->iter_values;
   std::vector<ir::Var> iter_vars = block_node->iter_vars;
@@ -218,10 +222,14 @@ std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
 std::string GetBlockName(const ir::Expr block) {
   const ir::ScheduleBlockRealize* block_realize =
       block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
   const ir::ScheduleBlock* block_node =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
phi::errors::InvalidArgument("The block is not a ScheduleBlock")); return block_node->name; } diff --git a/paddle/cinn/auto_schedule/auto_tuner.cc b/paddle/cinn/auto_schedule/auto_tuner.cc index d45dcc743e525..9524e1ed3048f 100644 --- a/paddle/cinn/auto_schedule/auto_tuner.cc +++ b/paddle/cinn/auto_schedule/auto_tuner.cc @@ -34,7 +34,7 @@ #include "paddle/cinn/hlir/framework/op.h" #include "paddle/cinn/hlir/framework/visualize_helper.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -144,9 +144,10 @@ void PrintResult(const TuningResult& result) { } TuningResult AutoTuner::Tune(const TuningOptions& options) { - CHECK_GT(options.num_tuning_rounds, 0) << "Invalid config"; - VLOG(3) << "Begin tuning with round num=" << options.num_tuning_rounds - << ", tasks size=" << tasks_.size(); + PADDLE_ENFORCE_GT(options.num_tuning_rounds, + 0, + phi::errors::InvalidArgument( + "The num_tuning_rounds should be greater than 0.")); TuningResult result; result.subgraphs.resize(tasks_.size()); diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc index a9074c76fa8cf..54396ecaa6e2e 100644 --- a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc +++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc @@ -24,7 +24,7 @@ #include "paddle/cinn/auto_schedule/search_space/search_state.h" #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -45,8 +45,10 @@ void ExprCostModel::Train(const std::vector& samples, const cinn::common::Target& target) { trained_times_.store(1); size_t total_size = samples.size(); - CHECK_EQ(total_size, labels.size()) - << "Samples must have same size as labels"; + PADDLE_ENFORCE_EQ( + total_size, + labels.size(), + phi::errors::InvalidArgument("Samples must have same size as labels")); std::vector> train_feature_numbers(total_size); FeatureExtractor extractor; for (size_t i = 0; i < total_size; ++i) { @@ -63,8 +65,10 @@ void ExprCostModel::Update(const std::vector& samples, const cinn::common::Target& target) { ++trained_times_; size_t total_size = samples.size(); - CHECK_EQ(total_size, labels.size()) - << "Samples must have same size as labels"; + PADDLE_ENFORCE_EQ( + total_size, + labels.size(), + phi::errors::InvalidArgument("Samples must have same size as labels")); std::vector> train_feature_numbers(total_size); FeatureExtractor extractor; for (size_t i = 0; i < total_size; ++i) { diff --git a/paddle/cinn/auto_schedule/database/database.cc b/paddle/cinn/auto_schedule/database/database.cc index 2036b44a83fef..ee8277b9dadd6 100644 --- a/paddle/cinn/auto_schedule/database/database.cc +++ b/paddle/cinn/auto_schedule/database/database.cc @@ -22,7 +22,7 @@ #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/schedule_desc.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -42,8 +42,10 @@ proto::TuningRecord TuningRecord::ToProto() const { Database::Database(int capacity_per_task) : capacity_per_task_(capacity_per_task) { - CHECK_GT(capacity_per_task_, 0) - << "capacity_per_task_ should be greater than 0"; + PADDLE_ENFORCE_GT(capacity_per_task_, + 0, + phi::errors::InvalidArgument( + "capacity_per_task_ should be greater than 0")); } std::unique_ptr Database::Make(const DatabaseConfig& config) { 
diff --git a/paddle/cinn/auto_schedule/measure/simple_builder.cc b/paddle/cinn/auto_schedule/measure/simple_builder.cc index 5be5b8528616f..0636cfc2b79fa 100644 --- a/paddle/cinn/auto_schedule/measure/simple_builder.cc +++ b/paddle/cinn/auto_schedule/measure/simple_builder.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/cinn/auto_schedule/measure/simple_builder.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -25,8 +25,10 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler) : graph_compiler_(graph_compiler) {} BuildResult SimpleBuilder::Build(const MeasureInput& input) { - CHECK_NE(graph_compiler_, static_cast(nullptr)) - << "empty handle to GraphCompiler"; + PADDLE_ENFORCE_NE( + graph_compiler_, + static_cast(nullptr), + phi::errors::InvalidArgument("empty handle to GraphCompiler")); CompilationContext& context = graph_compiler_->GetCompilationContext(); context.groups.emplace_back(input.task->subgraph); context.lowered_funcs.emplace_back(input.lowered_funcs); diff --git a/paddle/cinn/auto_schedule/measure/simple_runner.cc b/paddle/cinn/auto_schedule/measure/simple_runner.cc index 92dcc00693b5b..ec3929aff71ae 100644 --- a/paddle/cinn/auto_schedule/measure/simple_runner.cc +++ b/paddle/cinn/auto_schedule/measure/simple_runner.cc @@ -25,7 +25,7 @@ #include "paddle/cinn/hlir/framework/buffer.h" #include "paddle/cinn/hlir/framework/scope.h" #include "paddle/cinn/hlir/framework/tensor.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -76,8 +76,11 @@ static void PopulateRandomValue(const cinn::common::Type& type, std::generate_n( fmt_ptr, numel, [&engine, &dist]() { return dist(engine); }); } else { - CHECK_EQ(type.bytes(), 8) - << "Unsupported type: " << type << ", type.bytes = " << type.bytes(); + PADDLE_ENFORCE_EQ( + type.bytes(), + 8, + phi::errors::Unimplemented("Unsupported type, the type.bytes is %d", + type.bytes())); auto* fmt_ptr = reinterpret_cast(raw_ptr); std::uniform_int_distribution dist( std::numeric_limits::min(), @@ -127,7 +130,12 @@ static std::unordered_set ParamsNeedInitWithZero( std::vector param_idxs = kInitWithZeroParams.at(node->op()->name); const auto& inlinks = node->inlinks_in_order(); for (int param_idx : param_idxs) { - CHECK_GT(inlinks.size(), param_idx); + PADDLE_ENFORCE_GT(inlinks.size(), + param_idx, + phi::errors::InvalidArgument( + "The input size of the node is less than the " + "index of the parameter that needs to be " + "initialized to 0")); auto& edge = inlinks.at(param_idx); std::string param_name = edge->source()->as()->id(); @@ -141,7 +149,10 @@ static std::unordered_set ParamsNeedInitWithZero( } SimpleRunner::SimpleRunner(int repeat_times) : repeat_times_(repeat_times) { - CHECK_GT(repeat_times_, 0) << "repeat_times can't less than 0"; + PADDLE_ENFORCE_GT( + repeat_times_, + 0, + phi::errors::InvalidArgument("repeat_times should be greater than 0")); } // Prepare execution arguments of all instructions to run, a argument diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc index 2e3c4b0e21661..ffc8a0f21d903 100644 --- a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/ir/ir_printer.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/schedule_desc.h" - +#include 
"paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -29,7 +29,10 @@ int ExtractNumThreads(const ir::IRSchedule& ir_schedule, if (step.type == "Bind" && step.attrs.find("thread_axis") != step.attrs.end() && absl::get(step.attrs.at("thread_axis")) == bind_axis) { - CHECK_EQ(step.inputs.at("loop").size(), 1); + PADDLE_ENFORCE_EQ(step.inputs.at("loop").size(), + 1, + phi::errors::InvalidArgument( + "The loop size of bind step should be 1")); return step.inputs.at("loop")[0].As()->extent.as_int32(); } } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc index e59ba8b423293..523763942c64e 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/ir/schedule_block_graph.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -40,8 +40,11 @@ bool IsSpatialLoop(const ir::For* for_node) { const auto* schedule_block = block_realize->schedule_block.As(); CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock"; - CHECK_EQ(block_realize->iter_values.size(), - schedule_block->iter_vars.size()); + PADDLE_ENFORCE_EQ( + block_realize->iter_values.size(), + schedule_block->iter_vars.size(), + phi::errors::InvalidArgument( + "The size of iter_values and iter_vars should be equal.")); for (int i = 0; i < block_realize->iter_values.size(); ++i) { const ir::Var& iter_var = schedule_block->iter_vars[i]; const ir::Expr& binding = block_realize->iter_values[i]; @@ -93,10 +96,16 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule, int max_blocks, int max_threads_per_block) { auto all_loops = ir_schedule->GetLoops(block_name); - CHECK_LE(num_loops_to_bind, all_loops.size()) - << "The number of loops to be bind is greater than size of all_loops"; - CHECK_GE(num_loops_to_bind, 0) - << "The number of loops to be bind should be greater than 0"; + PADDLE_ENFORCE_LE( + num_loops_to_bind, + all_loops.size(), + phi::errors::InvalidArgument( + "The number of loops to be bind is greater than size of all_loops")); + PADDLE_ENFORCE_GE( + num_loops_to_bind, + 0, + phi::errors::InvalidArgument( + "The number of loops to be bind should be greater than 0")); // check whether it is the case that threadIdx has been binded but blockIdx // not, the threadIdx can only be binded in the first loop after // num_loops_to_bind loops because we has excluded other cases in @@ -130,13 +139,19 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule, if (extent <= max_blocks * max_threads_per_block) { auto splits = ir_schedule->Split(fused_loop, {-1, max_threads_per_block}); - CHECK_EQ(splits.size(), 2); + PADDLE_ENFORCE_EQ( + splits.size(), + 2, + phi::errors::InvalidArgument("The size of splits should be 2.")); ir_schedule->Bind(splits[0], "blockIdx.x"); ir_schedule->Bind(splits[1], "threadIdx.x"); } else { auto splits = ir_schedule->Split(fused_loop, {-1, max_blocks, max_threads_per_block}); - CHECK_EQ(splits.size(), 3); + PADDLE_ENFORCE_EQ( + splits.size(), + 3, + phi::errors::InvalidArgument("The size of splits should be 3.")); ir_schedule->Reorder({splits[1], splits[2], splits[0]}); all_loops = ir_schedule->GetLoops(block_name); ir_schedule->Bind(all_loops[0], "blockIdx.x"); @@ -160,8 +175,11 @@ RuleApplyType AutoBind::Init(ir::IRSchedule* ir_schedule) { } void 
AutoBind::Apply(int index) { - CHECK_LT(index, applicable_schedule_blocks_.size()) - << "invalid apply index:" << index; + PADDLE_ENFORCE_LT( + index, + applicable_schedule_blocks_.size(), + phi::errors::InvalidArgument( + "The index should be less than size of applicable_schedule_blocks_")); auto applied_block = applicable_schedule_blocks_.at(index); auto all_loops = ir_schedule_->GetLoops(applied_block); BindGPUIndex(ir_schedule_, diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc index e52d91c125224..ef0dbef492a59 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc @@ -20,7 +20,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -28,16 +28,19 @@ AutoGenRule::AutoGenRule(const cinn::common::Target& target) : target_(&target) {} int AutoGenRule::NumberApplicable() const { - CHECK_GE(num_applicable_, 0) - << "Call " << GetRuleName() - << "::NumberApplicable() without initialization."; + PADDLE_ENFORCE_GE( + num_applicable_, + 0, + phi::errors::InvalidArgument( + "The num_applicable_ should be greater than or equal to 0.")); return num_applicable_; } void AutoGenRule::ApplyRandomly() { - CHECK_GT(num_applicable_, 0) - << "Call " << GetRuleName() - << "::ApplyRandomly() with NumberApplicable() == 0"; + PADDLE_ENFORCE_GT(num_applicable_, + 0, + phi::errors::InvalidArgument( + "The num_applicable_ should be greater than 0.")); int index = rand() % num_applicable_; // NOLINT return Apply(index); } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc index c052d2995c8ad..a4ecd5036e2e7 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -22,7 +22,7 @@ #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -97,8 +97,9 @@ RuleApplyType AutoUnroll::Init(ir::IRSchedule* ir_schedule) { } void AutoUnroll::Apply(int index) { - CHECK_LT(index, applicable_schedule_blocks_.size()) - << "invalid apply index:" << index; + PADDLE_ENFORCE_LT(index, + applicable_schedule_blocks_.size(), + phi::errors::InvalidArgument("Index is out of range.")); auto applied_block = applicable_schedule_blocks_.at(index); int max_step = auto_unroll_options[std::rand() % auto_unroll_options.size()]; ir_schedule_->Annotate( diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h index 1bbc8da4497d6..759dbfa54d3a4 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h @@ -27,7 +27,7 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -103,8 +103,11 @@ class MultiLevelTiling : public AutoGenRule { // Sample num_split integers whose product equals extent template 
std::vector SampleTileSplit(T extent, int num_split) const { - CHECK_GT(num_split, 0) - << "num_split in SampleTileSplit must be greater than 0"; + PADDLE_ENFORCE_GT( + num_split, + 0, + phi::errors::InvalidArgument( + "num_split in SampleTileSplit must be greater than 0")); if (num_split == 1) { return {extent}; } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc index 85bc207c84fc7..0053c87a81394 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc @@ -23,7 +23,7 @@ #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -32,10 +32,16 @@ bool ReductionFactoring::CanApply(const std::string& block_name, ir::Expr block_expr = ir_schedule->GetBlock(block_name); ir::ScheduleBlockRealize* block_realize = block_expr.As(); - CHECK_NOTNULL(block_realize); + PADDLE_ENFORCE_NOT_NULL( + block_realize, + phi::errors::InvalidArgument( + "The block_expr should be a ScheduleBlockRealize.")); ir::ScheduleBlock* sch_block = block_realize->schedule_block.As(); - CHECK_NOTNULL(sch_block); + PADDLE_ENFORCE_NOT_NULL( + sch_block, + phi::errors::InvalidArgument( + "The schedule_block field is not a ScheduleBlock.")); AnalyzeScheduleBlockReadWriteBuffer(sch_block); // 1. The block must have write buffer @@ -135,7 +141,11 @@ void ReductionFactoring::Apply(const std::string& block_name, return; } // 3. Reorder if new_loop_order differs from the original order - CHECK_EQ(all_loops.size(), new_loop_order.size()); + PADDLE_ENFORCE_EQ( + all_loops.size(), + new_loop_order.size(), + phi::errors::InvalidArgument("The size of all_loops should be equal to " + "the size of new_loop_order.")); for (int i = 0; i < all_loops.size(); ++i) { if (all_loops[i].As()->loop_var->name != new_loop_order[i].As()->loop_var->name) { @@ -152,7 +162,11 @@ void ReductionFactoring::Apply(const std::string& block_name, for (int i = num_spatial_loops; i < all_loops.size(); ++i) { reduction_loop_indices.push_back(i); } - CHECK_EQ(reduction_loop_indices.size(), num_reduction_loops); + PADDLE_ENFORCE_EQ(reduction_loop_indices.size(), + num_reduction_loops, + phi::errors::InvalidArgument( + "The size of reduction_loop_indices should be equal " + "to num_reduction_loops.")); fused_reduce_loop = ir_schedule->Fuse(block_name, reduction_loop_indices); } else { all_loops = ir_schedule->GetLoops(block_name); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc index d56d97f83df60..fb327c130dbbf 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc @@ -23,8 +23,8 @@ #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" #include "paddle/cinn/ir/ir_printer.h" +#include "paddle/common/enforce.h" #include "test/cpp/cinn/concrete_program_builder.h" - PD_DECLARE_bool(cinn_new_group_scheduler); namespace cinn { @@ -64,8 +64,13 @@ class TestReductionFactoring : public TestAutoGenRuleBase { // check const std::vector& blocks = ir_schedule.GetAllBlocks(); - CHECK_EQ(blocks.size(), 2UL); - CHECK_EQ(ir.str(), 
expected_ir);
+    PADDLE_ENFORCE_EQ(
+        blocks.size(),
+        2UL,
+        phi::errors::InvalidArgument("The size of blocks should be 2."));
+    PADDLE_ENFORCE_EQ(ir.str(),
+                      expected_ir,
+                      phi::errors::InvalidArgument("The ir is not correct."));
   }
 };
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
index 994027dba0ee4..66d25c65542d1 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-
 #include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/cinn.h"
@@ -29,6 +28,7 @@
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/framework/tensor.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
+#include "paddle/common/enforce.h"
 #ifdef CINN_WITH_CUDA
 #include
 #endif
@@ -89,8 +89,10 @@ std::string TestAutoGenRuleBase::GetIR(const ir::IRSchedule& schedule) {
 ir::Module TestAutoGenRuleBase::BuildIRModule(const ir::IRSchedule& schedule) {
   auto&& updated_bodys = schedule.GetModule().GetExprs();
-  CHECK_EQ(lowered_funcs_.size(), updated_bodys.size())
-      << "associated exprs size not equal";
+  PADDLE_ENFORCE_EQ(
+      lowered_funcs_.size(),
+      updated_bodys.size(),
+      phi::errors::InvalidArgument("Associated exprs size not equal"));
   ir::Module::Builder builder("test_builder", this->target_);
   for (int i = 0; i < lowered_funcs_.size(); ++i) {
@@ -175,10 +177,16 @@ void CheckResult(raw_func_type test_func,
                  const cinn::common::Target& target) {
   CHECK(input_names.size()) << "The number of inputs must be greater than 0.";
   CHECK(output_names.size()) << "The number of outputs must be greater than 0.";
-  CHECK_EQ(input_names.size(), input_shapes.size())
-      << "The quantity of input_names and input_shapes must be equal.";
-  CHECK_EQ(output_names.size(), output_shapes.size())
-      << "The quantity of output_names and output_shapes must be equal.";
+  PADDLE_ENFORCE_EQ(
+      input_names.size(),
+      input_shapes.size(),
+      phi::errors::InvalidArgument(
+          "The quantity of input_names and input_shapes must be equal."));
+  PADDLE_ENFORCE_EQ(
+      output_names.size(),
+      output_shapes.size(),
+      phi::errors::InvalidArgument(
+          "The quantity of output_names and output_shapes must be equal."));
   // Initialize data
   std::vector input_data_ptrs(input_names.size());
diff --git a/paddle/cinn/auto_schedule/search_space/block_sampler.cc b/paddle/cinn/auto_schedule/search_space/block_sampler.cc
index 93de31e6a5e36..38d3b7badd02a 100644
--- a/paddle/cinn/auto_schedule/search_space/block_sampler.cc
+++ b/paddle/cinn/auto_schedule/search_space/block_sampler.cc
@@ -17,7 +17,7 @@
 #include
 #include "paddle/cinn/ir/ir.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -27,7 +27,10 @@ std::unique_ptr<BlockSampler> BlockSampler::Make(
     const std::string& strategy,
     utils::LinearRandomEngine::StateType rand_seed,
     const std::vector<int>& weights) {
-  CHECK_GT(all_blocks.size(), 0) << "Empty block list";
+  PADDLE_ENFORCE_GT(
+      all_blocks.size(),
+      0,
+      phi::errors::InvalidArgument("The all_blocks should not be empty."));
   if (strategy == "traversal") {
     VLOG(6) << "Init TraversalBlockSampler with block num = "
             << all_blocks.size();
@@ -87,7 +90,11 @@ ProbabilisticBlockSampler::ProbabilisticBlockSampler(
   if (weights.empty()) {
     weights_.resize(all_blocks.size(), 1);
   } else {
-    CHECK_EQ(all_blocks.size(), weights_.size());
+    PADDLE_ENFORCE_EQ(
+        all_blocks.size(),
+        weights_.size(),
+        phi::errors::InvalidArgument(
+            "The size of all_blocks and weights should be equal."));
   }
   remains_ = all_blocks.size();
 }
diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
index 3c0868d0748e5..bd8e818546a91 100644
--- a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
+++ b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
@@ -16,7 +16,7 @@
 #include
 #include
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -26,7 +26,10 @@ std::unique_ptr<RuleSampler> RuleSampler::Make(
     const std::string& strategy,
     utils::LinearRandomEngine::StateType rand_seed,
     const std::vector<int>& weights) {
-  CHECK_GT(potential_rules.size(), 0) << "Empty rule list";
+  PADDLE_ENFORCE_GT(
+      potential_rules.size(),
+      0,
+      phi::errors::InvalidArgument("The potential_rules should not be empty."));
   if (strategy == "traversal") {
     return std::make_unique<TraversalRuleSampler>(potential_rules,
                                                   default_remove_policy);
@@ -64,7 +67,11 @@ ProbabilisticRuleSampler::ProbabilisticRuleSampler(
   if (weights.empty()) {
     weights_.resize(potential_rules.size(), 1);
   } else {
-    CHECK_EQ(potential_rules.size(), weights_.size());
+    PADDLE_ENFORCE_EQ(
+        potential_rules.size(),
+        weights_.size(),
+        phi::errors::InvalidArgument(
+            "The size of potential_rules should be the same as the size of "
+            "weights."));
   }
   remains_ = potential_rules.size();
 }
diff --git a/paddle/cinn/auto_schedule/search_space/search_space.cc b/paddle/cinn/auto_schedule/search_space/search_space.cc
index 650e1d572f831..a4f4db6472e1b 100644
--- a/paddle/cinn/auto_schedule/search_space/search_space.cc
+++ b/paddle/cinn/auto_schedule/search_space/search_space.cc
@@ -33,7 +33,7 @@
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(auto_schedule_use_cost_model);
 namespace cinn {
@@ -109,7 +109,10 @@ SearchState SearchSpace::RandomScheduleMutate(const SearchState& state) {
   --iter;
   int sample_rule_index = iter->second;
-  CHECK_LT(sample_rule_index, ret->applicable_rules.size());
+  PADDLE_ENFORCE_LT(sample_rule_index,
+                    ret->applicable_rules.size(),
+                    phi::errors::InvalidArgument(
+                        "The sample_rule_index should be less than the size "
+                        "of ret->applicable_rules."));
   AutoGenRule* sample_rule = ret->applicable_rules.at(sample_rule_index);
   VLOG(7) << "Apply rule: " << sample_rule->GetRuleName()
           << " with index=" << sample_weighted_index - iter->first;
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
index dcb6e1ca93914..6403283f18be1 100644
--- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
@@ -35,7 +35,7 @@
 #include "paddle/cinn/utils/multi_threading.h"
 #include "paddle/cinn/utils/sized_multi_set.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(auto_schedule_use_cost_model);
 namespace cinn {
@@ -175,9 +175,11 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1,
   std::vector<ir::Expr> mother_exprs =
       state2->ir_schedule.GetModule().GetExprs();
-  CHECK_EQ(father_exprs.size(), mother_exprs.size())
-      << "CrossOver ModuleExpr in EvolutionarySearch must have same number of "
-         "AST";
+  PADDLE_ENFORCE_EQ(father_exprs.size(),
+                    mother_exprs.size(),
+                    phi::errors::InvalidArgument(
+                        "CrossOver ModuleExpr in EvolutionarySearch must have "
"same number of AST")); for (size_t i = 0; i < father_exprs.size(); ++i) { if (utils::SampleUniformInt(0, 2, &rand_seed_) == 0) { @@ -200,10 +202,15 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1, SearchState EvolutionarySearch::Mutate( const SearchState& state, utils::LinearRandomEngine::StateType* rand_seed) { - CHECK_GT(weighted_mutators_.size(), 0) - << "There is no mutate rule can be applied."; + PADDLE_ENFORCE_GT( + weighted_mutators_.size(), + 0, + phi::errors::InvalidArgument("There is no mutate rule can be applied.")); double accu_weight = (weighted_mutators_.rbegin())->first; - CHECK_GT(accu_weight, 0) << "The accumulate weight must be greater than 0."; + PADDLE_ENFORCE_GT(accu_weight, + 0, + phi::errors::InvalidArgument( + "The accumulate weight must be greater than 0.")); // sample a mutate rule double sample_weight = utils::SampleUniformDouble(0, accu_weight, rand_seed); auto sampled_iter = weighted_mutators_.upper_bound(sample_weight); diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc index 6a983d7f9aaac..7791cdf9f89d5 100644 --- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc +++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc @@ -30,8 +30,8 @@ #include "paddle/cinn/hlir/framework/op_lowering.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/common/enforce.h" #include "test/cpp/cinn/program_builder.h" - namespace cinn { namespace auto_schedule { @@ -159,7 +159,10 @@ TEST(EvolutionarySearch, Evolve) { auto tasks = CreateTasks( tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}), target); - CHECK_EQ(tasks.size(), 1); + PADDLE_ENFORCE_EQ( + tasks.size(), + 1, + phi::errors::InvalidArgument("The size of tasks should be 1.")); ExprCostModel cost_model; std::vector cost_model_samples(1); std::vector cost_model_labels(1); @@ -206,7 +209,11 @@ TEST(EvolutionarySearch, Evolve) { VLOG(6) << "cost = " << s->predicted_cost; } VLOG(6) << "total_cost_next = " << total_cost_next; - CHECK_LE(total_cost_next, total_cost_pre); + PADDLE_ENFORCE_LE( + total_cost_next, + total_cost_pre, + phi::errors::InvalidArgument("The total cost should be less than or " + "equal to the previous one.")); std::swap(population_pre_ptr, population_next_ptr); } } diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc index 273cba4c4060e..a027dc9dd1ed5 100644 --- a/paddle/cinn/auto_schedule/task/task_optimizer.cc +++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc @@ -18,7 +18,6 @@ #include #include - #include "paddle/cinn/auto_schedule/analysis/analyze_ir.h" #include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h" #include "paddle/cinn/auto_schedule/measure/measure.h" @@ -34,6 +33,7 @@ #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" +#include "paddle/common/enforce.h" #ifdef CINN_WITH_CUDA #include @@ -223,9 +223,12 @@ bool IsWrappedByCustomCall(const TuneTask* task) { TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution( const TuningOptions& options) { - CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0) - << "TuningOptions.num_measure_trials % " - "TuningOptions.num_samples_per_iteration must be 0."; + PADDLE_ENFORCE_EQ( + options.num_measure_trials % options.num_samples_per_iteration, + 
+      0,
+      phi::errors::InvalidArgument(
+          "TuningOptions.num_measure_trials % "
+          "TuningOptions.num_samples_per_iteration must be 0."));
   VLOG(4) << "Optimizing TuneTask with num_measure_trials:"
           << options.num_measure_trials
@@ -290,9 +293,11 @@ TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
           << measure_inputs.size();
   std::vector<MeasureResult> measure_outputs =
       schedule_measurer_->Measure(measure_inputs);
-  CHECK_EQ(measure_outputs.size(), states.size())
-      << "ScheduleMeasurer didn't output same number of MeasureOutput of "
-         "states in TaskOptimizer";
+  PADDLE_ENFORCE_EQ(measure_outputs.size(),
+                    states.size(),
+                    phi::errors::InvalidArgument(
+                        "ScheduleMeasurer didn't output same number of "
+                        "MeasureOutput of states in TaskOptimizer"));
   // record to database
   for (size_t i = 0; i < states.size(); ++i) {
     database_->AddRecord(TuningRecord(measure_inputs[i].task->serialized_key,
@@ -344,9 +349,11 @@ std::vector<SearchState> TaskOptimizer::SearchOneRound(
   for (size_t i = 0; i < states.size(); ++i) {
     std::vector<ir::Expr> best_exprs =
         states[i]->ir_schedule.GetModule().GetExprs();
-    CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
-        << "RuntimeError: Expr size is not equal to LoweredFunc size in "
-           "TaskOptimizer";
+    PADDLE_ENFORCE_EQ(best_exprs.size(),
+                      task_->lowered_funcs.size(),
+                      phi::errors::InvalidArgument(
+                          "Expr size is not equal to LoweredFunc size in "
+                          "TaskOptimizer"));
     auto init_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
     std::vector<ir::LoweredFunc> valid_funcs;
     for (size_t j = 0; j < best_exprs.size(); ++j) {
@@ -369,8 +376,11 @@
   }
   states.erase(states.begin() + valid_cnt, states.end());
-  CHECK_EQ(states.size(), measure_candidates->size())
-      << "result size of states not equal to measure_candidates";
+  PADDLE_ENFORCE_EQ(
+      states.size(),
+      measure_candidates->size(),
+      phi::errors::InvalidArgument(
+          "result size of states not equal to measure_candidates"));
   VLOG(4) << "EvolutionarySearch return size=" << states.size()
           << ", valid count=" << valid_cnt;
   VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result",
diff --git a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
index a8961e45b980d..f59acbe612635 100644
--- a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
@@ -19,7 +19,7 @@
 #include "paddle/cinn/auto_schedule/task/tune_task.h"
 #include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
 #include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -27,7 +27,10 @@ std::unique_ptr<TaskScheduler> TaskScheduler::Make(
     const std::vector<TuneTask>& tasks,
     const Config& config,
     const std::string& strategy) {
-  CHECK_GT(tasks.size(), 0) << "Empty task list";
+  PADDLE_ENFORCE_GT(
+      tasks.size(),
+      0,
+      phi::errors::InvalidArgument("The size of tasks should be greater than 0."));
   if (strategy == "round_robin") {
     return std::make_unique<RoundRobin>(tasks, config);
   } else if (strategy == "efficiency_priority") {
diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
index 2966467b3eda6..c9f2630ac6e8a 100644
--- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -32,8 +32,8 @@
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/data_util.h"
"paddle/cinn/utils/data_util.h" +#include "paddle/common/enforce.h" #include "test/cpp/cinn/program_builder.h" - /* This test is used as a tool to evaluate or compare performance of 3 * schedules(no schedule, manual schedule, auto-schedule). One can specify which * schedules to be evaluated through `FLAGS_evaluate_knobs` and specify which @@ -355,7 +355,10 @@ TEST_F(PerformanceTester, Gather) { // paddle model test TEST_F(PerformanceTester, ResNet50) { - CHECK_NE(FLAGS_resnet50_model_dir, ""); + PADDLE_ENFORCE_NE(FLAGS_resnet50_model_dir, + "", + phi::errors::InvalidArgument( + "The FLAGS_resnet50_model's dir should not be empty.")); FLAGS_cinn_infer_model_version = 1.0; std::unordered_map> feeds = { {"inputs", {batch_size, 3, 224, 224}}}; diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc index 85443b02c0a8c..07dc8421de6cc 100644 --- a/paddle/cinn/backends/codegen_c.cc +++ b/paddle/cinn/backends/codegen_c.cc @@ -26,7 +26,7 @@ #include "paddle/cinn/runtime/cpu/thread_backend.h" #include "paddle/cinn/runtime/intrinsic.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" //! Root of the builtin code. PD_DECLARE_string(cinn_x86_builtin_code_root); @@ -205,7 +205,10 @@ void CodeGenC::Visit(const ir::For *op) { Expr num_task_var = Var("num_task"); IrPrinter::Visit((op->extent + num_task_var - 1) / num_task_var); str_ += ";\n"; - CHECK_EQ(min.as_int32(), 0); + PADDLE_ENFORCE_EQ( + min.as_int32(), + 0, + phi::errors::InvalidArgument("The min of the for loop should be 0")); auto task_id = Var("task_id"); auto n_per_task = Var("n_per_task"); min = task_id * n_per_task; @@ -370,7 +373,10 @@ void CodeGenC::PrintCallArgs(const ir::Call *op) { } void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) { - CHECK_EQ(op->read_args.size(), 2UL); + PADDLE_ENFORCE_EQ( + op->read_args.size(), + 2UL, + phi::errors::InvalidArgument("The number of read_args should be 2")); str_ += op->name; str_ += "("; PrintCastExpr("void*", op->read_args[0]); @@ -380,7 +386,10 @@ void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) { } void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) { - CHECK_EQ(op->read_args.size(), 1UL); + PADDLE_ENFORCE_EQ( + op->read_args.size(), + 1UL, + phi::errors::InvalidArgument("The number of read_args should be 1")); str_ += op->name; str_ += "("; str_ += "&("; @@ -390,7 +399,10 @@ void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) { } void CodeGenC::PrintCall_get_address(const ir::Call *op) { - CHECK_EQ(op->read_args.size(), 1UL); + PADDLE_ENFORCE_EQ( + op->read_args.size(), + 1UL, + phi::errors::InvalidArgument("The number of read_args should be 1")); CHECK(op->write_args.empty()); auto *read_var = op->read_args.front().as_var(); auto *read_buf = op->read_args.front().as_buffer(); @@ -409,7 +421,10 @@ void CodeGenC::PrintCall_get_address(const ir::Call *op) { void CodeGenC::PrintCall_pod_values_to_array(const ir::Call *op) { CHECK(!op->read_args.empty()); - CHECK_EQ(op->write_args.size(), 1UL); + PADDLE_ENFORCE_EQ( + op->write_args.size(), + 1UL, + phi::errors::InvalidArgument("The number of write_args should be 1")); auto output_var = op->write_args.front().as_var_ref(); CHECK(output_var.defined()); @@ -612,9 +627,12 @@ void CodeGenC::Visit(const ir::_LoweredFunc_ *op) { DoIndent(); - CHECK_EQ(op->alloc_output_buffer_exprs.size(), - op->dealloc_output_buffer_exprs.size()) - << "the count of allocation and deallocation expressions is not match"; + PADDLE_ENFORCE_EQ( + 
op->alloc_output_buffer_exprs.size(), + op->dealloc_output_buffer_exprs.size(), + phi::errors::InvalidArgument( + "The count of allocation and deallocation expressions is not " + "match")); std::vector new_body; diff --git a/paddle/cinn/backends/codegen_c_x86.cc b/paddle/cinn/backends/codegen_c_x86.cc index 394b61e35816d..06a9ff1fda2f9 100644 --- a/paddle/cinn/backends/codegen_c_x86.cc +++ b/paddle/cinn/backends/codegen_c_x86.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/cinn/backends/codegen_c_x86.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -53,7 +53,11 @@ void CodeGenCX86::Visit(const ir::Load *op) { } void CodeGenCX86::Visit(const ir::Broadcast *op) { - CHECK_GT(op->type().lanes(), 1); + PADDLE_ENFORCE_GT( + op->type().lanes(), + 1, + phi::errors::InvalidArgument( + "The lanes of the broadcast op should be greater than 1.")); int bits = op->type().bits() * op->type().lanes(); if (SupportsAVX512() && bits == 512) { diff --git a/paddle/cinn/backends/codegen_c_x86.h b/paddle/cinn/backends/codegen_c_x86.h index f0b040a94f1ae..bf90612292d20 100644 --- a/paddle/cinn/backends/codegen_c_x86.h +++ b/paddle/cinn/backends/codegen_c_x86.h @@ -18,7 +18,7 @@ #include "paddle/cinn/backends/codegen_c.h" #include "paddle/cinn/ir/intrinsic_ops.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -114,8 +114,10 @@ void CodeGenCX86::VisitBinaryOp(const Op *op, Expr a, Expr b, const std::string &op_repr) { - CHECK_EQ(a.type(), b.type()) << " a is : " << a << ", and b is : " << b - << ". op_repr is : " << op_repr; + PADDLE_ENFORCE_EQ( + a.type(), + b.type(), + phi::errors::InvalidArgument("The type of a and b should be the same.")); // scalar. if (a.type().lanes() == 1) { diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 9c19c6faffb73..919edfc680ca7 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -26,8 +26,8 @@ #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/utils/ir_verify.h" #include "paddle/cinn/optim/ir_simplify.h" +#include "paddle/common/enforce.h" #include "paddle/common/errors.h" - namespace cinn { namespace backends { @@ -122,7 +122,8 @@ std::vector FilterDeallocTempBuffers(const std::vector &frees) { std::vector filtered; for (const Expr &free : frees) { const ir::Free *op = free.As(); - CHECK_NOTNULL(op); + PADDLE_ENFORCE_NOT_NULL( + op, phi::errors::InvalidArgument("Free is not a free node")); bool has_symbolic_constant = false; const ir::_Buffer_ *buffer = op->destination.As(); for (Expr shape : buffer->shape) { @@ -305,7 +306,10 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, void CodeGenCUDA_Dev::PrintIncludes() { str_ += GetSourceHeader(); } void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) { - CHECK_NE(buffer->type(), Void()); + PADDLE_ENFORCE_NE( + buffer->type(), + Void(), + phi::errors::InvalidArgument("buffer type should not be void")); // Calculate buffer size and determine if it contains a symbolic constant Expr buffer_size(1); for (int i = 0; i < buffer->shape.size(); i++) { diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc index b888db7c7c726..1ba4714153395 100644 --- a/paddle/cinn/backends/codegen_cuda_host.cc +++ b/paddle/cinn/backends/codegen_cuda_host.cc @@ -23,7 +23,7 @@ #include "paddle/cinn/backends/extern_func_jit_register.h" #include 
"paddle/cinn/backends/llvm/llvm_util.h" #include "paddle/cinn/runtime/intrinsic.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -65,10 +65,22 @@ llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher( llvm::Value* kernel_stream = nullptr; if (ll_function_args.size() == 3) { kernel_stream = ll_function_args[2]; - CHECK_EQ(kernel_stream->getType(), ll_void_p_ty()); // void* stream + PADDLE_ENFORCE_EQ( + kernel_stream->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_stream should be void*")); // void* stream } - CHECK_EQ(kernel_args->getType(), ll_void_p_ty()); // void* args - CHECK_EQ(kernel_args_count->getType(), ll_int32_ty()); // int32 + PADDLE_ENFORCE_EQ( + kernel_args->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args should be void*")); // void* args + PADDLE_ENFORCE_EQ( + kernel_args_count->getType(), + ll_int32_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args_count should be int32")); // int32 std::unordered_map global_args = { {KERNEL_ARGS, kernel_args}, @@ -199,7 +211,11 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) { // @} // Set local scope table - CHECK_EQ(ll_function_args.size(), func->args.size()); + PADDLE_ENFORCE_EQ(ll_function_args.size(), + func->args.size(), + phi::errors::InvalidArgument( + "The number of arguments is not equal to the number of " + "function arguments")); for (int i = 0; i < ll_function_args.size(); ++i) { SetVar(func->args[i].name(), ll_function_args[i]); } @@ -224,7 +240,11 @@ llvm::Value* CodeGenCUDA_Host::LowerParseArgsValueCall( const ir::Call* call_ir) { auto ret_type = CinnTypeToLLVMType(Int(64), m_); std::vector args_type; - CHECK_EQ(call_ir->read_args.size(), 2); + PADDLE_ENFORCE_EQ( + call_ir->read_args.size(), + 2, + phi::errors::InvalidArgument( + "The number of arguments of ParseArgsValue should be 2")); CHECK(call_ir->read_args[0].is_var() && call_ir->read_args[0].as_var()->type().is_cpp_handle()); CHECK(call_ir->read_args[1].type().is_int(32)); @@ -251,10 +271,22 @@ llvm::Value* CodeGenCUDA_Host::LowerCUDAKernelCall(const ir::Call* call_ir) { llvm::Value* kernel_stream = nullptr; if (ll_function_args.size() == 3) { kernel_stream = ll_function_args[2]; - CHECK_EQ(kernel_stream->getType(), ll_void_p_ty()); // void* stream + PADDLE_ENFORCE_EQ( + kernel_stream->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_stream should be void*")); // void* stream } - CHECK_EQ(kernel_args->getType(), ll_void_p_ty()); // void* args - CHECK_EQ(kernel_args_count->getType(), ll_int32_ty()); // int32 + PADDLE_ENFORCE_EQ( + kernel_args->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args should be void*")); // void* args + PADDLE_ENFORCE_EQ( + kernel_args_count->getType(), + ll_int32_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args_count should be int32")); // int32 std::unordered_map global_args = { {KERNEL_ARGS, kernel_args}, diff --git a/paddle/cinn/backends/codegen_device_util.cc b/paddle/cinn/backends/codegen_device_util.cc index 3373ed15e3bec..91c18ea35e9ea 100644 --- a/paddle/cinn/backends/codegen_device_util.cc +++ b/paddle/cinn/backends/codegen_device_util.cc @@ -68,6 +68,18 @@ std::string Predicate2String(ir::Expr predicate) { return ss.str(); } +static std::string CurTailFnName(const std::string &origin_fn_name) { + const int MaxStrLength = 16383; + if (origin_fn_name.length() <= MaxStrLength) { + return 
origin_fn_name;
+  }
+  VLOG(6) << "Function name too long. Curtail and concat hash.";
+  const std::string new_fn_name =
+      origin_fn_name.substr(0, MaxStrLength) +
+      std::to_string(std::hash<std::string>()(origin_fn_name));
+  return new_fn_name;
+}
+
 std::string
 detail::CollectBucketStrategyHostFunctionVisitor::GenDeviceKernelName(
     const std::string &fn_name, ir::Expr predicate) {
@@ -80,7 +92,10 @@
     pos = cond_str.find("-", pos + replacement.length());
   }
   VLOG(3) << "predicate string: " << cond_str;
-  return fn_name + "__COND_" + cond_str + "__kernel";
+  // NOTE(chenxi67): The kernel name is too long to be supported in cuda12.3 so
+  // we need to curtail it.
+  const std::string new_fn_name = CurTailFnName(fn_name);
+  return new_fn_name + "__COND_" + cond_str + "__kernel";
 }
 void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
diff --git a/paddle/cinn/backends/codegen_device_util.h b/paddle/cinn/backends/codegen_device_util.h
index caada3153e63b..ff3114c71296b 100644
--- a/paddle/cinn/backends/codegen_device_util.h
+++ b/paddle/cinn/backends/codegen_device_util.h
@@ -27,7 +27,7 @@
 #include "paddle/cinn/ir/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
@@ -205,7 +205,11 @@ struct CollectBucketStrategyHostFunctionVisitor
     if (op->functions.size() == 1 && op->predicates.size() == 0) {
       expr->as_module()->predicates.push_back(ir::Expr(true));
     }
-    CHECK_EQ(op->functions.size(), op->predicates.size());
+    PADDLE_ENFORCE_EQ(
+        op->functions.size(),
+        op->predicates.size(),
+        phi::errors::InvalidArgument(
+            "The size of functions and predicates should be equal"));
     for (int i = 0; i < op->functions.size(); ++i) {
       ProcessLoweredFunc(op->functions[i], op->predicates[i]);
       if (i == 0) {
diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc
index 4f02a35411413..72678eec44c22 100644
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -230,15 +230,23 @@ void SourceCodePrint::write(const std::string& source_code) {
   }
 }
-void Compiler::Build(const Module& module, const std::string& code) {
-  auto PatternMatch =
-      adt::match{[&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
-                 [&](common::X86Arch) { CompileX86Module(module); },
-                 [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
-                 [&](common::NVGPUArch) { CompileCudaModule(module, code); }};
+void Compiler::Build(const Module& module,
+                     const std::string& code,
+                     const bool end) {
+  auto PatternMatch = adt::match{
+      [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) { CompileX86Module(module, end); },
+      [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) { CompileCudaModule(module, code, end); }};
   return std::visit(PatternMatch, target_.arch.variant());
 }
+void Compiler::AppendCX86(const Module& module) {
+  VLOG(3) << "Start Compiler::BuildCX86" << module;
+  CompileX86Module(module, true);
+  VLOG(3) << "Over Compiler::BuildCX86";
+}
+
 std::string Compiler::GetSourceCode(const ir::Module& module) {
   return target_.arch.Visit(adt::match{
       [&](common::UnknownArch) -> std::string { CINN_NOT_IMPLEMENTED; },
@@ -287,7 +295,8 @@ std::string GetFileContent(const std::string& path) {
 }  // namespace
 void Compiler::CompileCudaModule(const Module& module,
-                                 const std::string& code) {
+                                 const std::string& code,
+                                 bool add_module) {
 #ifdef CINN_WITH_CUDA
   auto
_host_module_device_module_ = SplitDeviceAndHostModule(module); // NOLINT @@ -337,15 +346,15 @@ void Compiler::CompileCudaModule(const Module& module, } engine_ = ExecutionEngine::Create(ExecutionOptions(), std::move(symbols)); - engine_->Link(host_module); + engine_->Link(host_module, add_module); #else CINN_NOT_IMPLEMENTED #endif } -void Compiler::CompileX86Module(const Module& module) { - engine_->Link(module); +void Compiler::CompileX86Module(const Module& module, bool add_module) { + engine_->Link(module, add_module); } void Compiler::ExportObject(const std::string& path) { diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h index f269b00492a42..d43455cf76287 100644 --- a/paddle/cinn/backends/compiler.h +++ b/paddle/cinn/backends/compiler.h @@ -107,7 +107,10 @@ class Compiler final { /** * Compile and link to a CINN module. */ - void Build(const ir::Module& module, const std::string& code = ""); + void Build(const ir::Module& module, + const std::string& code = "", + const bool end = true); + void AppendCX86(const ir::Module& module); void ExportObject(const std::string& path); @@ -125,9 +128,10 @@ class Compiler final { private: void CompileCudaModule(const ir::Module& module, - const std::string& code = ""); + const std::string& code = "", + bool add_module = true); - void CompileX86Module(const ir::Module& module); + void CompileX86Module(const ir::Module& module, bool add_module = true); explicit Compiler(const Target& target) : target_(target), engine_(ExecutionEngine::Create(ExecutionOptions())) {} diff --git a/paddle/cinn/backends/function_prototype.cc b/paddle/cinn/backends/function_prototype.cc index e413521246b8f..e46b172bf65ed 100644 --- a/paddle/cinn/backends/function_prototype.cc +++ b/paddle/cinn/backends/function_prototype.cc @@ -20,7 +20,7 @@ #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/runtime/flags.h" - +#include "paddle/common/enforce.h" PD_DECLARE_bool(verbose_function_register); namespace cinn { @@ -42,13 +42,22 @@ bool FunctionProto::Match(const ir::Call *op) const { } void FunctionProto::AssertMatch(const ir::Call *op) const { - CHECK_EQ(name, op->name); - CHECK_EQ(ret_type, op->type()) - << "function proto " << name << " check failed"; - CHECK_EQ(op->read_args.size(), readonly_arg_types.size()) - << "function proto " << name << " check failed"; - CHECK_EQ(op->write_args.size(), mutable_arg_types.size()) - << "function proto " << name << " check failed"; + PADDLE_ENFORCE_EQ( + name, + op->name, + phi::errors::InvalidArgument("function proto's op name check failed")); + PADDLE_ENFORCE_EQ( + ret_type, + op->type(), + phi::errors::InvalidArgument("function proto's op type check failed")); + PADDLE_ENFORCE_EQ(op->read_args.size(), + readonly_arg_types.size(), + phi::errors::InvalidArgument( + "function proto's readonly arg types check failed")); + PADDLE_ENFORCE_EQ(op->write_args.size(), + mutable_arg_types.size(), + phi::errors::InvalidArgument( + "function proto's mutable arg types check failed")); auto get_type = [](Expr u) { if (u.as_tensor() || u.as_buffer()) { @@ -61,14 +70,21 @@ void FunctionProto::AssertMatch(const ir::Call *op) const { if (readonly_arg_types[i] == type_of()) { if (!op->read_args[i].as_tensor()) continue; } else { - CHECK_EQ(get_type(op->read_args[i]), readonly_arg_types[i]); + PADDLE_ENFORCE_EQ( + get_type(op->read_args[i]), + readonly_arg_types[i], + phi::errors::InvalidArgument( + "function proto's readonly arg types check failed")); } } for (int i = 0; i < op->write_args.size(); i++) { if 
(mutable_arg_types[i] == type_of()) { if (!op->write_args[i].as_tensor()) continue; } else { - CHECK_EQ(get_type(op->write_args[i]), mutable_arg_types[i]); + PADDLE_ENFORCE_EQ(get_type(op->write_args[i]), + mutable_arg_types[i], + phi::errors::InvalidArgument( + "function proto's mutable arg types check failed")); } } } @@ -86,7 +102,10 @@ void FunctionProto::CheckValid() { FunctionProto::shape_inference_t FunctionProto::ShapeFollowNthArgument(int n) { return [=](const std::vector &args, int value_offset) { - CHECK_LT(n, args.size()); + PADDLE_ENFORCE_LT( + n, + args.size(), + phi::errors::InvalidArgument("The argument index is out of range")); auto x = args[n].as_tensor(); CHECK(x); return x->shape; diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc index 29eae201bbb78..7dd78ddb9cd86 100644 --- a/paddle/cinn/backends/ir_schedule_test.cc +++ b/paddle/cinn/backends/ir_schedule_test.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/optim/remove_schedule_block.h" #include "paddle/cinn/optim/unroll_loops.h" #include "paddle/cinn/optim/vectorize_loops.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -563,7 +563,10 @@ TEST(IrSchedule, vectorize) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); ir_sch.Vectorize(loops[1], 16); std::string origin = utils::GetStreamCnt(func[0]); EXPECT_EQ(origin, utils::Trim(R"ROC( @@ -637,7 +640,10 @@ TEST(IrSchedule, unroll) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); ir_sch.Unroll(loops[1]); std::string origin = utils::GetStreamCnt(func[0]); EXPECT_EQ(origin, utils::Trim(R"ROC( @@ -711,7 +717,10 @@ TEST(IrSchedule, bind) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); ir_sch.Bind(loops[0], "blockIdx.x"); std::string origin = utils::GetStreamCnt(func[0]); EXPECT_EQ(origin, utils::Trim(R"ROC( @@ -753,7 +762,10 @@ TEST(IrSchedule, simple_compute_at) { auto func = cinn::lang::LowerVec( "test_simple_compute_at", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -826,7 +838,10 @@ TEST(IrSchedule, compute_at0) { auto func = cinn::lang::LowerVec( "test_compute_at0", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -900,7 +915,10 @@ TEST(IrSchedule, compute_at1) { auto func = cinn::lang::LowerVec( "test_compute_at1", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -972,7 +990,10 @@ TEST(IrSchedule, compute_at2) { auto func = 
cinn::lang::LowerVec( "test_compute_at2", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1044,7 +1065,10 @@ TEST(IrSchedule, compute_at3) { auto func = cinn::lang::LowerVec( "test_compute_at3", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1125,7 +1149,10 @@ TEST(IrSchedule, compute_at4) { auto func = cinn::lang::LowerVec( "test_compute_at4", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1187,7 +1214,10 @@ TEST(IrSchedule, compute_at5) { auto func = cinn::lang::LowerVec( "test_compute_at5", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1250,7 +1280,10 @@ TEST(IrSchedule, compute_at6) { auto func = cinn::lang::LowerVec( "test_compute_at6", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1316,7 +1349,10 @@ TEST(IrSchedule, cache_read1) { auto func = cinn::lang::LowerVec( "test_cache_read1", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1399,7 +1435,10 @@ TEST(IrSchedule, cache_read2) { auto func = cinn::lang::LowerVec( "test_cache_read2", stages, {A, B}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1469,7 +1508,10 @@ TEST(IrSchedule, cache_write1) { auto func = cinn::lang::LowerVec( "test_cache_write1", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1553,7 +1595,10 @@ TEST(IrSchedule, cache_write2) { auto func = cinn::lang::LowerVec( "test_cache_write2", stages, {A, B}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1624,7 +1669,10 @@ TEST(IrSchedule, cache_read3) { auto func = cinn::lang::LowerVec( "test_cache_read3", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1705,7 +1753,10 @@ TEST(IrSchedule, cache_write3) { 
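Every ir_schedule_test.cc hunk in this file applies the same mechanical rewrite, so one instance can stand for all of them: a bare glog CHECK_* becomes a PADDLE_ENFORCE_* call that carries an explicit phi::errors category and message. A minimal sketch of the before/after shape, reusing identifiers from the hunks above:

// Before: glog macro, aborts the process with a terse message.
// CHECK_EQ(func.size(), 1U);

// After: structured Paddle exception; the comparison macros map 1:1
// (CHECK_GT -> PADDLE_ENFORCE_GT, CHECK_NOTNULL -> PADDLE_ENFORCE_NOT_NULL).
PADDLE_ENFORCE_EQ(
    func.size(),
    1U,
    phi::errors::InvalidArgument("The size of func should be 1."));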
auto func = cinn::lang::LowerVec( "test_cache_write3", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1788,7 +1839,10 @@ TEST(IrSchedule, sync_threads) { auto func = cinn::lang::LowerVec( "test_sync_threads", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1870,7 +1924,10 @@ TEST(IrSchedule, cache_write4) { auto func = cinn::lang::LowerVec( "test_cache_write4", stages, {A, B}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1953,7 +2010,10 @@ TEST(IrSchedule, rfactor) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2080,7 +2140,10 @@ TEST(IrSchedule, rfactor1) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.Rfactor(loops[1], 1); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2206,7 +2269,10 @@ TEST(IrSchedule, rfactor2) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("C"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2347,7 +2413,10 @@ TEST(IrSchedule, factorize_reduction) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 0); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2436,7 +2505,10 @@ TEST(IrSchedule, factorize_reduction1) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 1); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2520,9 +2592,15 @@ TEST(IrSchedule, factorize_reduction2) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); auto splited_loops = ir_sch.Split(loops[1], {4, 5}); - CHECK_EQ(splited_loops.size(), 2U); + 
PADDLE_ENFORCE_EQ( + splited_loops.size(), + 2U, + phi::errors::InvalidArgument("The size of splited_loops should be 2.")); auto new_rf_tensor = ir_sch.FactorizeReduction(splited_loops[0], 1); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -3278,13 +3356,19 @@ TEST(IrSchedule, ComplexIndices) { VLOG(3) << "Lowered Expr:" << ir_sch.GetModule().GetExprs().front(); auto loops_b = ir_sch.GetLoops("B"); - CHECK_EQ(loops_b.size(), 2); + PADDLE_ENFORCE_EQ( + loops_b.size(), + 2, + phi::errors::InvalidArgument("The loops size of B should be 2.")); ir_sch.Split("B", 0, {8, -1}); ir_sch.Split( "B", 2, {32, -1}); // after the first split, the loop count has grown to 3 VLOG(3) << "Splited Expr:" << ir_sch.GetModule().GetExprs().front(); - CHECK_EQ(ir_sch.GetLoops("B").size(), 4); + PADDLE_ENFORCE_EQ(ir_sch.GetLoops("B").size(), + 4, + phi::errors::InvalidArgument( + "The loops size of B should be 4 after split.")); ir_sch.Reorder("B", {2, 0, 3, 1}); VLOG(3) << "Reordered Expr:\n" << ir_sch.GetModule().GetExprs().front(); diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index 2f8a387045bf6..d7889ebb9fc15 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -24,7 +24,6 @@ #include #include #include - #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include "paddle/common/enforce.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -205,12 +205,12 @@ llvm::Value *CodeGenLLVM::EmitBinaryOp(llvm::Value *lhs, bool is_integral, bool is_signed) { llvm::Instruction::BinaryOps ops; - CHECK_EQ(lhs->getType(), rhs->getType()) - << "the types of operands of binary operation are mismatch" - << ", lhs[" << DumpToString(*lhs) << "] " << opcode << " rhs[" - << DumpToString(*rhs) << "]" - << ", lhs_type[" << DumpToString(*lhs->getType()) << "], rhs_type[" - << DumpToString(*rhs->getType()) << "]"; + PADDLE_ENFORCE_EQ( + lhs->getType(), + rhs->getType(), + phi::errors::InvalidArgument( + "the types of operands of the binary operation are mismatched")); + switch (opcode) { case '+': ops = is_integral ? 
llvm::Instruction::BinaryOps::Add @@ -288,6 +288,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Sub *op) { } llvm::Value *CodeGenLLVM::Visit(const ir::Mul *op) { + ir::TryElevateInt32ToInt64({op->a(), op->b()}); auto *lhs = Visit(&op->a()); auto *rhs = Visit(&op->b()); return EmitBinaryOp(lhs, rhs, '*', is_integral_type(op->type())); @@ -591,8 +592,8 @@ llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) { llvm::Value *old_var = GetVar(op->loop_var->name); // loop iterator - llvm::AllocaInst *loop_var = - Alloca(b_->getInt32Ty(), nullptr, op->loop_var->name); + llvm::AllocaInst *loop_var = Alloca( + b_->getIntNTy(op->min->type().bits()), nullptr, op->loop_var->name); loop_var->setAlignment(llvm::Align(4)); SetVar(op->loop_var->name, loop_var); @@ -613,7 +614,8 @@ llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) { // loop_body b_->SetInsertPoint(body_bb); - llvm::Value *step = llvm::ConstantInt::get(b_->getInt32Ty(), stride); + llvm::Value *step = + llvm::ConstantInt::get(b_->getIntNTy(op->min->type().bits()), stride); Visit(&op->body); llvm::Value *indvar_inc = Add(indvar, @@ -880,7 +882,10 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Load *op) { { int alignment = op->type().bits(); alignment = 8; - CHECK_GT(alignment, 0); + PADDLE_ENFORCE_GT( + alignment, + 0, + phi::errors::InvalidArgument("alignment should be greater than 0")); load_inst->setAlignment(llvm::Align(std::min(alignment, 8))); } @@ -949,7 +954,10 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) { { int alignment = op->type().bits(); alignment = 8; - CHECK_GT(alignment, 0); + PADDLE_ENFORCE_GT( + alignment, + 0, + phi::errors::InvalidArgument("alignment should be greater than 0")); store_inst->setAlignment(llvm::Align(std::min(alignment, 8))); } // TODO(fc500110): tbaa AliasAnalysis @@ -1059,9 +1067,12 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_LoweredFunc_ *op) { auto init_function_state = [this]() { alias_vars_.clear(); }; init_function_state(); - CHECK_EQ(op->alloc_output_buffer_exprs.size(), - op->dealloc_output_buffer_exprs.size()) - << "the count of allocation and deallocation expressions is not match"; + PADDLE_ENFORCE_EQ( + op->alloc_output_buffer_exprs.size(), + op->dealloc_output_buffer_exprs.size(), + phi::errors::InvalidArgument( + "the counts of allocation and deallocation expressions do not " + "match")); std::vector new_body; auto create_temp_buffers = op->PrepareCreateTempBufferExprs(); @@ -1228,7 +1239,11 @@ llvm::Value *CodeGenLLVM::EmitCall_get_address(const ir::Call *op) { llvm::Value *CodeGenLLVM::EmitCall_debug_info(const ir::Call *op) { auto callee = m_->getFunction(runtime::intrinsic::debug_log_repr); - CHECK_GE(op->read_args.size(), 1UL); + PADDLE_ENFORCE_GE(op->read_args.size(), + 1UL, + phi::errors::InvalidArgument( + "debug_log_repr should have at least " + "1 argument")); std::vector args; for (auto &arg : op->read_args) { args.push_back(Visit(&arg)); } @@ -1315,7 +1330,9 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) { slices.push_back(load_inst); } - CHECK_EQ(slices.size(), 1UL); + PADDLE_ENFORCE_EQ(slices.size(), + 1UL, + phi::errors::InvalidArgument("slices size should be 1.")); return slices[0]; } @@ -1323,7 +1340,11 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) { llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t, llvm::Value *buffer, llvm::Value *index) { - CHECK_GT(t.lanes(), 1) << "type is not a vector type: " << t; + PADDLE_ENFORCE_GT(t.lanes(), + 1, + phi::errors::InvalidArgument("type 
lanes should be greater " + "than 1, but received %d", + t.lanes())); llvm::PointerType *btype = llvm::dyn_cast(buffer->getType()); CHECK(btype); @@ -1338,7 +1359,11 @@ llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t, llvm::Value *CodeGenLLVM::CreateBufferPtr(Type t, llvm::Value *buffer, llvm::Value *index) { - CHECK_EQ(t.lanes(), 1); + PADDLE_ENFORCE_EQ(t.lanes(), + 1, + phi::errors::InvalidArgument("type lanes should be 1, but " + "received %d", + t.lanes())); auto *btype = llvm::dyn_cast(buffer->getType()); CHECK(btype); auto *ptype = @@ -1355,7 +1380,10 @@ llvm::Value *CodeGenLLVM::CreateVecSlice(llvm::Value *vec, int lanes) { int total_lanes = llvm::dyn_cast(vec->getType())->getNumElements(); - CHECK_LE(begin + lanes, total_lanes); + PADDLE_ENFORCE_LE(begin + lanes, + total_lanes, + phi::errors::InvalidArgument( + "begin + lanes should be less than or equal to total_lanes")); if (lanes == total_lanes && begin == 0) return vec; // full slice std::vector indices; for (int i = 0; i < lanes; ++i) { @@ -1422,7 +1450,10 @@ void CodeGenLLVM::AddTbaaMetadata(llvm::Instruction *inst, if (pstride_int && pbase_int) { int stride = pstride_int->value; base = pbase_int->value; - CHECK_GE(base, 0); + PADDLE_ENFORCE_GE( + base, + 0, + phi::errors::InvalidArgument("base should be greater than or equal to 0")); width = NextPowerOfTwo(ramp->lanes * stride); while (base % width) { @@ -1491,12 +1522,15 @@ llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BufferCreate *op) { CHECK(buffer_node); std::vector args( {ll_const_int32(buffer_node->target.runtime_arch())}); - uint64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8; - for (auto shape : buffer_node->shape) { - int shape_int = shape.as_int32(); - memory_size *= shape_int; + int64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8; + // Calculate the buffer size as an Expr so symbolic dimensions are preserved + Expr buffer_size(static_cast(1)); + buffer_size = buffer_size * ir::Expr(memory_size); + for (int i = 0; i < buffer_node->shape.size(); i++) { + buffer_size = buffer_size * buffer_node->shape[i]; } - args.push_back(ll_const_int64(memory_size)); + ir::TryElevateInt32ToInt64({buffer_size}); + args.push_back(Visit(&buffer_size)); args.push_back(ll_const_int32(32)); return Call(callee, args); @@ -1596,29 +1630,50 @@ llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BuiltinIntrin *op) { std::string func_name = op->name; if (op->id == -1) { if (func_name == "bitwise_and") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "bitwise_and should have at least 2 arguments")); return b_->CreateAnd(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "bitwise_or") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "bitwise_or should have at least 2 arguments")); return b_->CreateOr(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "bitwise_xor") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "bitwise_xor should have at least 2 arguments")); return b_->CreateXor(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "bitwise_not") { - CHECK_GE(op->args.size(), 1U); + PADDLE_ENFORCE_GE(op->args.size(), + 1U, + phi::errors::InvalidArgument( + "bitwise_not should have at least 1 argument")); return b_->CreateNot(Visit(&op->args[0])); } else if (func_name == "left_shift") { - CHECK_GE(op->args.size(), 2U); + 
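Two of the codegen_llvm.cc hunks above serve one goal: 64-bit-safe lowering. Loop induction variables and strides now take their width from the loop bound's type instead of a hard-coded i32, and BufferCreate keeps the allocation size as an ir::Expr so symbolic dimensions survive to codegen. A condensed sketch of both changes, using the members (b_, Alloca, Visit) that appear in the diff:

// Induction variable width follows op->min (i32, or i64 after elevation).
llvm::Type *iv_ty = b_->getIntNTy(op->min->type().bits());
llvm::AllocaInst *loop_var = Alloca(iv_ty, nullptr, op->loop_var->name);
llvm::Value *step = llvm::ConstantInt::get(iv_ty, stride);

// Buffer size stays symbolic: a product Expr, elevated to int64, then
// emitted like any other expression instead of being folded on the host.
Expr buffer_size(static_cast<int64_t>(1));
buffer_size = buffer_size * ir::Expr(memory_size);  // bytes per element
for (auto &dim : buffer_node->shape) buffer_size = buffer_size * dim;
ir::TryElevateInt32ToInt64({buffer_size});
args.push_back(Visit(&buffer_size));  // emitted as an llvm::Value*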
PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "left_shift should have at least 2 arguments")); return b_->CreateShl(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "right_shift") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "right_shift should have at least 2 arguments")); if (op->args[0]->type().is_int()) { return b_->CreateAShr(Visit(&op->args[0]), Visit(&op->args[1])); } else { return b_->CreateLShr(Visit(&op->args[0]), Visit(&op->args[1])); } } else if (func_name == "isnan") { - CHECK_GE(op->args.size(), 1U); + PADDLE_ENFORCE_GE(op->args.size(), + 1U, + phi::errors::InvalidArgument( + "isnan should have at least 1 argument")); llvm::Value *v = Visit(&op->args[0]); return b_->CreateFCmpUNO(v, v); } diff --git a/paddle/cinn/backends/llvm/codegen_llvm_test.cc b/paddle/cinn/backends/llvm/codegen_llvm_test.cc index 930e70f22e869..074e960aba678 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm_test.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm_test.cc @@ -21,12 +21,12 @@ #include #include #include - #include #include #include #include #include +#include "paddle/common/enforce.h" #include "paddle/cinn/backends/llvm/cinn_runtime_llvm_ir.h" #include "paddle/cinn/cinn.h" @@ -96,7 +96,10 @@ auto CreateIrBuffer(cinn::common::Type t, std::string name, std::vector shape, int data_alignment = 0) { - CHECK_GE(data_alignment, 0); + PADDLE_ENFORCE_GE(data_alignment, + 0, + phi::errors::InvalidArgument( + "data_alignment should be greater than or equal to 0")); auto buffer = ir::_Buffer_::Make(std::move(name), std::move(t)); if (data_alignment) { diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc index cfd796162241c..5987e3af7a7c3 100644 --- a/paddle/cinn/backends/llvm/codegen_x86.cc +++ b/paddle/cinn/backends/llvm/codegen_x86.cc @@ -30,7 +30,7 @@ #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/runtime/intrinsic.h" - +#include "paddle/common/enforce.h" namespace cinn::backends { CodeGenX86::CodeGenX86(llvm::Module* m, @@ -144,8 +144,10 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) { symbol_table_->PopScope(); std::swap(parallel_env_, par_env); std::swap(f_, f); - CHECK_NE(par_env.parallel_loop_count, 0) - << "find no parallel loop within parallel launch"; + PADDLE_ENFORCE_NE(par_env.parallel_loop_count, + 0, + phi::errors::InvalidArgument( + "find no parallel loop within parallel launch")); b_->SetInsertPoint(launch_end); } diff --git a/paddle/cinn/backends/llvm/execution_engine.cc b/paddle/cinn/backends/llvm/execution_engine.cc index 050fd4e0d8389..8a84d69a1d7a0 100644 --- a/paddle/cinn/backends/llvm/execution_engine.cc +++ b/paddle/cinn/backends/llvm/execution_engine.cc @@ -166,17 +166,20 @@ std::unique_ptr NaiveObjectCache::getObject( VLOG(2) << "===================== Create CINN ExecutionEngine end " "===================="; + engine->ctx = std::make_unique(); + engine->b = std::make_unique>(*engine->ctx); + llvm::SMDiagnostic error; + engine->m = llvm::parseAssemblyString( + AsStringRef(backends::kRuntimeLlvmIr), error, *engine->ctx); + return engine; } template -void ExecutionEngine::Link(const ir::Module &module) { +void ExecutionEngine::Link(const ir::Module &module, bool add_module) { utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); llvm::SMDiagnostic error; - auto ctx = std::make_unique(); - auto m = 
llvm::parseAssemblyString( - AsStringRef(backends::kRuntimeLlvmIr), error, *ctx); - auto b = std::make_unique>(*ctx); + auto ir_emitter = std::make_unique(m.get(), b.get()); VLOG(3) << "ir_emitter->Compile(module) Begin"; ir_emitter->Compile(module); @@ -200,7 +203,9 @@ void ExecutionEngine::Link(const ir::Module &module) { pass_manager, rawstream, nullptr, llvm::CGFT_ObjectFile); pass_manager.run(*m); - CHECK(AddModule(std::move(m), std::move(ctx))); + if (add_module) { + AddSelfModule(); + } if (VLOG_IS_ON(5)) { VLOG(5) << "======= dump jit execution session ======"; @@ -231,6 +236,9 @@ bool ExecutionEngine::AddModule(std::unique_ptr module, llvm::cantFail(jit_->addIRModule(std::move(tsm))); return true; } +bool ExecutionEngine::AddSelfModule() { + return AddModule(std::move(m), std::move(ctx)); +} void ExecutionEngine::ExportObject(const std::string &path) { FILE *of = fopen(path.c_str(), "w"); @@ -268,8 +276,11 @@ void ExecutionEngine::RegisterRuntimeSymbols() { } } -template void ExecutionEngine::Link(const ir::Module &module); -template void ExecutionEngine::Link(const ir::Module &module); -template void ExecutionEngine::Link(const ir::Module &module); +template void ExecutionEngine::Link(const ir::Module &module, + bool add_module); +template void ExecutionEngine::Link(const ir::Module &module, + bool add_module); +template void ExecutionEngine::Link(const ir::Module &module, + bool add_module); } // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/execution_engine.h b/paddle/cinn/backends/llvm/execution_engine.h index 63f9427a53edb..44b212f245f90 100644 --- a/paddle/cinn/backends/llvm/execution_engine.h +++ b/paddle/cinn/backends/llvm/execution_engine.h @@ -79,18 +79,22 @@ class ExecutionEngine { void *Lookup(absl::string_view name); template - void Link(const ir::Module &module); + void Link(const ir::Module &module, bool add_module = true); void ExportObject(const std::string &path); bool AddModule(std::unique_ptr module, std::unique_ptr context); + bool AddSelfModule(); + protected: explicit ExecutionEngine(bool enable_object_cache, RuntimeSymbols &&module_symbols) : cache_(std::make_unique()), - module_symbols_(std::move(module_symbols)) {} + module_symbols_(std::move(module_symbols)), + ctx(std::make_unique()), + b(std::make_unique>(*ctx)) {} void RegisterRuntimeSymbols(); @@ -106,6 +110,10 @@ class ExecutionEngine { std::unique_ptr jit_; std::unique_ptr cache_; RuntimeSymbols module_symbols_; + + std::unique_ptr ctx; + std::unique_ptr m; + std::unique_ptr> b; }; } // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/execution_engine_test.cc b/paddle/cinn/backends/llvm/execution_engine_test.cc index a13f329a81259..beb3ec61fae25 100644 --- a/paddle/cinn/backends/llvm/execution_engine_test.cc +++ b/paddle/cinn/backends/llvm/execution_engine_test.cc @@ -26,7 +26,6 @@ #include #include #include - #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include "paddle/common/enforce.h" #include "paddle/cinn/backends/llvm/cinn_runtime_llvm_ir.h" #include "paddle/cinn/backends/llvm/codegen_llvm.h" @@ -91,7 +91,11 @@ auto CreateTestBuffer() { } float *Cd = reinterpret_cast(C->memory); - CHECK_EQ(C->num_elements(), A->num_elements()); + PADDLE_ENFORCE_EQ( + C->num_elements(), + A->num_elements(), + phi::errors::InvalidArgument( + "The number of elements of C and A should be the same.")); return std::make_tuple(A, B, C); } diff --git a/paddle/cinn/backends/llvm/llvm_intrin_rule.h b/paddle/cinn/backends/llvm/llvm_intrin_rule.h index 
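The ExecutionEngine changes above move the LLVMContext, Module, and IRBuilder from Link-local state into engine members, so Link(module, /*add_module=*/false) can leave the module open for more functions before AddSelfModule() finally hands it to the JIT. A sketch of the intended call sequence, inferred from the call sites elsewhere in this patch (compilation_task.cc) rather than quoted from it:

// Phase 1: lower and link the device/host module, but keep the LLVM
// module owned by the engine instead of JIT-ing it immediately.
compiler->Build(gpu_module, /*code=*/"", /*end=*/false);

// Phase 2: append the CPU fallback kernels ("*_CX86"); the engine's
// accumulated module is then moved into the JIT once via AddSelfModule()
// (assumption: AppendCX86 finalizes the build, the diff shows only call sites).
compiler->AppendCX86(cx86_module);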
903c056196f4e..14e3718299c0f 100644 --- a/paddle/cinn/backends/llvm/llvm_intrin_rule.h +++ b/paddle/cinn/backends/llvm/llvm_intrin_rule.h @@ -26,17 +26,24 @@ #include "paddle/cinn/ir/intrinsic_ops.h" #include "paddle/cinn/ir/registry.h" #include "paddle/cinn/lang/packed_func.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace codegen { template inline void MakeFloatIntrinOp(lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg = args[0]; ir::Call *node = arg->as(); CHECK(node); - CHECK_GE(node->read_args.size(), arg_nums); + PADDLE_ENFORCE_GE( + node->read_args.size(), + arg_nums, + phi::errors::InvalidArgument( + "The number of read args should be at least arg_nums.")); if (add_float_suffix) { CHECK(node->type().is_float()); *rv = ir::intrinsics::BuiltinIntrin::Make( @@ -85,7 +92,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_isfinite", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -96,7 +106,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_isinf", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -113,7 +126,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_rsqrt", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -124,7 +140,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_exp10", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -136,7 +155,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_tan", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -147,7 +169,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_tanh", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -168,7 +193,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_cosh", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = 
arg0->as(); CHECK(node); @@ -180,7 +208,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_sinh", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); diff --git a/paddle/cinn/backends/llvm/llvm_optimizer.cc b/paddle/cinn/backends/llvm/llvm_optimizer.cc index e64fb9f42ee0b..22f9a37351664 100644 --- a/paddle/cinn/backends/llvm/llvm_optimizer.cc +++ b/paddle/cinn/backends/llvm/llvm_optimizer.cc @@ -74,12 +74,12 @@ class CustomPassManager : public PassManagerT { void add(llvm::Pass *pass) override { if (print_passes_) { if (is_function_pass_manager_) { - VLOG(1) << "llvm run function pass[" << std::string(pass->getPassName()) + VLOG(4) << "llvm run function pass[" << std::string(pass->getPassName()) << "]"; } if (is_module_pass_manager_) { - VLOG(1) << "llvm run module pass[" << std::string(pass->getPassName()) + VLOG(4) << "llvm run module pass[" << std::string(pass->getPassName()) << "]"; } } diff --git a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc index 3885ebe0c4199..52dbe7f024307 100644 --- a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc +++ b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc @@ -20,8 +20,8 @@ #include #include "paddle/cinn/runtime/flags.h" +#include "paddle/common/enforce.h" #include "paddle/common/flags.h" - PD_DECLARE_bool(verbose_function_register); namespace cinn { @@ -51,8 +51,10 @@ void RuntimeSymbols::Register(const std::string &name, void *address) { std::lock_guard lock(mu_); auto it = symbols_.find(name); if (it != symbols_.end()) { - CHECK_EQ(it->second, address) - << "Duplicate register symbol [" << name << "]"; + PADDLE_ENFORCE_EQ( + it->second, + address, + phi::errors::InvalidArgument("Duplicate register symbol")); return; } diff --git a/paddle/cinn/backends/modular.cc b/paddle/cinn/backends/modular.cc index fb736154c7bfc..f735b8b6da56a 100644 --- a/paddle/cinn/backends/modular.cc +++ b/paddle/cinn/backends/modular.cc @@ -15,7 +15,7 @@ #include "paddle/cinn/backends/modular.h" #include "paddle/cinn/ir/ir_visitor.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -104,8 +104,14 @@ class ModularEvaluator : public ir::IRVisitorRequireReImpl { } static int gcd(int a, int b) { - CHECK_GE(a, 0); - CHECK_GE(b, 0); + PADDLE_ENFORCE_GE( + a, + 0, + phi::errors::InvalidArgument("a should be greater than or equal to 0")); + PADDLE_ENFORCE_GE( + b, + 0, + phi::errors::InvalidArgument("b should be greater than or equal to 0")); if (a < b) std::swap(a, b); if (b == 0) return a; diff --git a/paddle/cinn/backends/nvrtc/header_generator.cc b/paddle/cinn/backends/nvrtc/header_generator.cc index d4b2b9504673f..7d88ed16d0413 100644 --- a/paddle/cinn/backends/nvrtc/header_generator.cc +++ b/paddle/cinn/backends/nvrtc/header_generator.cc @@ -16,7 +16,7 @@ #include "glog/logging.h" #include "jitify.hpp" // NOLINT - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { namespace nvrtc { @@ -27,8 +27,10 @@ HeaderGeneratorBase& JitSafeHeaderGenerator::GetInstance() { } const size_t JitSafeHeaderGenerator::size() const { - CHECK_EQ(include_names_.size(), headers_.size()) - << "Internal error in size of header files."; + PADDLE_ENFORCE_EQ( + include_names_.size(), + headers_.size(), + 
phi::errors::InvalidArgument("Internal error in size of header files.")); return include_names_.size(); } diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util.cc b/paddle/cinn/backends/nvrtc/nvrtc_util.cc index 737d887ea809c..1b887268a1ae8 100644 --- a/paddle/cinn/backends/nvrtc/nvrtc_util.cc +++ b/paddle/cinn/backends/nvrtc/nvrtc_util.cc @@ -29,7 +29,7 @@ #include "paddle/cinn/common/common.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" PD_DECLARE_string(cinn_nvcc_cmd_path); PD_DECLARE_string(nvidia_package_dir); PD_DECLARE_bool(nvrtc_compile_to_cubin); @@ -187,7 +187,9 @@ std::string Compiler::CompileCudaSource(const std::string& code, std::string log; log.resize(log_size); NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0])); - CHECK_EQ(compile_res, NVRTC_SUCCESS) << log << "\nThe code is:\n" << code; + PADDLE_ENFORCE_EQ(compile_res, + NVRTC_SUCCESS, + phi::errors::Fatal("NVRTC compilation failed")); } size_t size; diff --git a/paddle/cinn/common/cas.h b/paddle/cinn/common/cas.h index 7fbd0bfe6aa00..2d796c639406f 100755 --- a/paddle/cinn/common/cas.h +++ b/paddle/cinn/common/cas.h @@ -51,12 +51,12 @@ struct CasInterval { * 1 <= iterator_i <= 5 */ CasInterval(Expr expr_l, Expr expr_r) { - VLOG(2) << "CasInterval is : [" << expr_l << ", " << expr_r << "]."; + VLOG(6) << "CasInterval is : [" << expr_l << ", " << expr_r << "]."; expr_r = detail::ReplaceMinToConstant(expr_r); expr_l = detail::ReplaceMaxToConstant(expr_l); optim::Simplify(&expr_l); optim::Simplify(&expr_r); - VLOG(2) << "After simplify, CasInterval is : [" << expr_l << ", " << expr_r + VLOG(6) << "After simplify, CasInterval is : [" << expr_l << ", " << expr_r << "]."; if (expr_l.is_constant() && expr_r.is_constant()) { diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index 36fe9e340fcd9..5e7d3e6d876cf 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -324,12 +324,12 @@ void SplitOp::Build(pir::Builder& builder, // NOLINT const char* GenerateShapeOp::attributes_name[attributes_num] = { "output_dim_exprs", "symbol_bindings"}; -void GenerateShapeOp::Build( - pir::Builder& builder, - pir::OperationArgument& argument, - const std::vector& inputs, - const std::vector& output_dim_exprs, - const GenerateShapeOp::SymbolBindings& symbol_bindings) { +void GenerateShapeOp::Build(pir::Builder& builder, + pir::OperationArgument& argument, + const std::vector& inputs, + const std::vector& output_dim_exprs, + const SymbolBindings& symbol_bindings, + const pir::Type& output_type) { if (inputs.empty()) { VLOG(3) << "GenerateShapeOp inputs is empty"; for (const auto& attr : output_dim_exprs) { @@ -344,13 +344,7 @@ void GenerateShapeOp::Build( argument.AddAttribute( "symbol_bindings", ConvertSymbolBindingsToAttribute(builder, symbol_bindings)); - argument.AddOutputs({[&]() { - auto* ctx = pir::IrContext::Instance(); - auto type = pir::Int64Type::get(ctx); - auto dim = - ::common::make_ddim({static_cast(output_dim_exprs.size())}); - return DenseTensorType::get(ctx, type, dim); - }()}); + argument.AddOutput(output_type); ::pir::PassStopGradientsDefaultly(argument); } diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h index 1eddfaffd0df1..06f306a0e3623 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h @@ -168,7 +168,8 @@ 
class IR_API GenerateShapeOp pir::OperationArgument &argument, // NOLINT const std::vector &inputs, const std::vector &output_dim_exprs, - const SymbolBindings &symbol_bindings); + const SymbolBindings &symbol_bindings, + const pir::Type &output_type); void VerifySig() {} diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc index 63d5b519ce887..ec82d41742a70 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc @@ -232,7 +232,7 @@ class BlockDimExprsAsserter { }; std::vector input_tensors{}; std::vector output_dim_expr_attrs{}; - GenerateShapeOp::SymbolBindings symbol_bindings{}; + SymbolBindings symbol_bindings{}; bool success = MakeGenerateShapeOpAttribute(ir_ctx_, LocalDimExprs4Value, @@ -242,14 +242,13 @@ class BlockDimExprsAsserter { &output_dim_expr_attrs, &symbol_bindings); if (!success) return std::nullopt; - auto out_shape_value = - builder_ - .Build( - input_tensors, output_dim_expr_attrs, symbol_bindings) - .out(); + auto out_type = paddle::dialect::DenseTensorType::get( + builder_.ir_context(), + pir::Int64Type::get(builder_.ir_context()), + ::common::make_ddim({dim_exprs.size()})); return builder_ .Build( - input_tensors, output_dim_expr_attrs, symbol_bindings) + input_tensors, output_dim_expr_attrs, symbol_bindings, out_type) .out(); } @@ -298,8 +297,11 @@ class BlockDimExprsAsserter { PADDLE_ENFORCE_EQ(lhs_numel, rhs_numel, ::common::errors::InvalidArgument( + "Check [%s id:%d] infer symbolic shape failed." "The numel of lhs and rhs must be equal, but " "received lhs's numel is [%d], rhs's numel is [%d]", + op->name(), + op->id(), lhs_numel, rhs_numel)); @@ -326,8 +328,8 @@ class BlockDimExprsAsserter { .out(); auto assert_op = builder_.Build( all_eq, assert_data, lhs_numel); - const std::string error_msg = "Check [" + op->name() + "_" + - std::to_string(op->id()) + + const std::string error_msg = "Check [" + op->name() + + " id:" + std::to_string(op->id()) + "] infer symbolic shape failed."; assert_op->set_attribute( paddle::dialect::AssertOp::ERROR_INFO_ATTR_NAME, diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc index 6281baeadbef2..ca422c1a593c8 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc @@ -190,6 +190,15 @@ ::pir::Operation* ConvertConcatOp(::pir::Operation* op, return pd_op; } +::pir::Operation* ConvertGenerateShapeOp( + ::pir::Operation* op, + ::pir::IrMapping& ir_mapping, // NOLINT + ::pir::Builder& builder) { // NOLINT + auto* new_op = op->Clone(ir_mapping, {true, true, true}); + builder.Insert(new_op); + return new_op; +} + ::pir::Operation* ConvertScaleOp(::pir::Operation* op, ::pir::IrMapping& ir_mapping, // NOLINT ::pir::PatternRewriter& rewriter) { // NOLINT @@ -404,6 +413,9 @@ REGISTER_TRANSFORM_RULES(concat_op, cinn::dialect::ConcatOp::name(), cinn::dialect::details::ConvertConcatOp); +REGISTER_TRANSFORM_RULES(generate_shape_op, + cinn::dialect::GenerateShapeOp::name(), + cinn::dialect::details::ConvertGenerateShapeOp); REGISTER_TRANSFORM_RULES(scale_op, cinn::dialect::ScaleOp::name(), cinn::dialect::details::ConvertScaleOp); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc 
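GenerateShapeOp::Build no longer synthesizes its own rank-1 int64 DenseTensorType; every caller now passes the output type explicitly, which lets call sites reuse an existing value's type when they have one. The common construction, as it appears in the updated passes (template arguments written out for readability):

auto out_type = paddle::dialect::DenseTensorType::get(
    builder.ir_context(),
    pir::Int64Type::get(builder.ir_context()),
    ::common::make_ddim(
        {static_cast<int64_t>(output_dim_expr_attrs.size())}));
auto shape_value =
    builder
        .Build<cinn::dialect::GenerateShapeOp>(
            input_tensors, output_dim_expr_attrs, symbol_bindings, out_type)
        .out();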
b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc index 17317924fb07e..0ffd284ac79f7 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc @@ -38,9 +38,10 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op, std::vector output_dim_expr_attrs{}; GenerateShapeOp::SymbolBindings symbol_bindings{}; - unsigned output_dim_idx = 0, input_dim_idx = 0; int64_t local_dim_expr_id = 0; - for (; output_dim_idx < output_shape.size(); ++output_dim_idx) { + for (unsigned output_dim_idx = 0, input_dim_idx = 0; + output_dim_idx < output_shape.size(); + ++output_dim_idx) { const auto& dim_expr = output_shape.at(output_dim_idx); if (dim_expr.isa()) { output_dim_expr_attrs.emplace_back( @@ -64,8 +65,16 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op, } } } + auto out_type = paddle::dialect::DenseTensorType::get( + rewriter.ir_context(), + pir::Int64Type::get(rewriter.ir_context()), + ::common::make_ddim( + {static_cast(output_dim_expr_attrs.size())})); auto cinn_generate_shape = rewriter.Build( - std::vector{input}, output_dim_expr_attrs, symbol_bindings); + std::vector{input}, + output_dim_expr_attrs, + symbol_bindings, + out_type); auto pd_reshape = rewriter.Build( op->operand_source(0), cinn_generate_shape.result(0)); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc index 0578c79b35a2b..473763bb4dcec 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc @@ -313,9 +313,18 @@ std::optional GetOutOfRewrittenGenerateShapeOp( &output_dim_expr_attrs, &symbol_bindings); if (!success) return std::nullopt; + auto out_type = [&]() -> pir::Type { + if (shape.type().isa()) { + return shape.type(); + } + return paddle::dialect::DenseTensorType::get( + rewriter->ir_context(), + pir::Int64Type::get(rewriter->ir_context()), + ::common::make_ddim({output_dim_expr_attrs.size()})); + }(); return rewriter ->Build( - input_tensors, output_dim_expr_attrs, symbol_bindings) + input_tensors, output_dim_expr_attrs, symbol_bindings, out_type) .out(); } @@ -323,9 +332,8 @@ bool ReplaceShapeOpsToGenerateShape( pir::OpOperand shape_operand, pir::PatternRewriter* rewriter, pir::ShapeConstraintIRAnalysis* shape_analysis) { - if (shape_operand.source() - .defining_op() - ->isa()) { + auto* shape_def_op = shape_operand.source().defining_op(); + if (!shape_def_op || shape_def_op->isa()) { return false; } auto ShapeOrDataDimExprs4Value = @@ -379,6 +387,82 @@ class FuseShapeOpsIntoGenerateShapeOpPattern } }; +class FuseSingleElementShapeOpsIntoGenerateShapeOpPattern + : public pir::RewritePattern { + public: + explicit FuseSingleElementShapeOpsIntoGenerateShapeOpPattern( + pir::IrContext* context) + : pir::RewritePattern(MatchAnyOpTypeTag(), + 1 /*benefit*/, + context, + {} /*generated_names*/) {} + + bool Match(pir::Operation* op) const override { + auto& shape_analysis = + pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram()); + if (!IsSingleElementShapeOp(op, &shape_analysis)) return false; + if (op->isa()) return false; + + // all user op's output should has no data of shape expr + pir::Value output = op->result(0); + if (output.use_empty()) return false; + for (auto iter = output.use_begin(); iter != 
output.use_end(); ++iter) { + auto* user = iter->owner(); + if (IsSingleElementShapeOp(user, &shape_analysis)) return false; + if (user->isa()) return false; + } + + return true; + } + + void Rewrite(pir::Operation* op, + pir::PatternRewriter& rewriter) const override { + auto& shape_analysis = + pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram()); + + auto ShapeOrDataDimExprs4Value = + [&shape_analysis]( + pir::Value value) -> const symbol::ShapeOrDataDimExprs& { + return shape_analysis.GetShapeOrDataForValue(value); + }; + std::optional opt_generated_shape = + GetOutOfRewrittenGenerateShapeOp( + op->result(0), &rewriter, ShapeOrDataDimExprs4Value); + if (!opt_generated_shape.has_value()) { + LOG(WARNING) << "Create GenerateShapeOp Failed."; + return; + } + + rewriter.ReplaceAllUsesWith(op->result(0), opt_generated_shape.value()); + + if (op->use_empty()) { + rewriter.EraseOp(op); + } + } + + private: + bool IsSingleElementShapeOp( + pir::Operation* op, + pir::ShapeConstraintIRAnalysis* shape_analysis) const { + if (op->num_operands() == 0) return false; + if (op->num_results() != 1) return false; + + pir::Value output = op->result(0); + const auto& out_shape = shape_analysis->GetShapeOrDataForValue(output); + if (!out_shape.isa()) return false; + if (!out_shape.data().has_value()) return false; + + auto dtype = + output.type().dyn_cast().dtype(); + if (!dtype.isa() && !dtype.isa()) { + return false; + } + + // Only process the op which output is a single element + return out_shape.data()->size() == 1; + } +}; + class FuseShapeOpsIntoGenerateShapeOpPass : public pir::PatternRewritePass { public: FuseShapeOpsIntoGenerateShapeOpPass() @@ -393,6 +477,7 @@ class FuseShapeOpsIntoGenerateShapeOpPass : public pir::PatternRewritePass { context); ps.Add>( context); + ps.Add(context); return ps; } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc index 30b470d42ca2a..f2afbae3d515d 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc @@ -83,8 +83,10 @@ std::optional InsertGenerateShapeOpToRunFirst( &symbol_bindings); if (success) { return builder - ->Build( - minimal_inputs, output_dim_expr_attrs, symbol_bindings) + ->Build(minimal_inputs, + output_dim_expr_attrs, + symbol_bindings, + value.type()) .out(); } return std::nullopt; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc index 8f0bab178d75c..c3daa04fc2f4e 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc @@ -233,17 +233,24 @@ std::tuple BroadcastableToCondValue( &rhs_symbol_bindings); CHECK(success); + auto out_type = paddle::dialect::DenseTensorType::get( + builder.ir_context(), + pir::Int64Type::get(builder.ir_context()), + ::common::make_ddim({1})); + auto lhs_value = builder .Build(lhs_minimal_inputs, lhs_output_dim_expr_attrs, - lhs_symbol_bindings) + lhs_symbol_bindings, + out_type) .out(); auto rhs_value = builder .Build(rhs_minimal_inputs, rhs_output_dim_expr_attrs, - rhs_symbol_bindings) + rhs_symbol_bindings, + out_type) .out(); auto const_one = builder diff --git 
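The new FuseSingleElementShapeOpsIntoGenerateShapeOpPattern targets scalar-like shape computations (a single int32/int64 element whose dim-expr data is known) that the existing per-op patterns miss, and it is registered alongside them in the pass. With the template argument spelled out, the registration in InitializePatterns reads:

// Registered in FuseShapeOpsIntoGenerateShapeOpPass::InitializePatterns;
// template argument restored here for readability.
ps.Add<FuseSingleElementShapeOpsIntoGenerateShapeOpPattern>(context);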
a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc index a36c208f0c96c..c2604697d68af 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc @@ -110,23 +110,26 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { : group_op_kind; } } - - auto group = std::make_shared(ops); - - if (fusion_op.attributes().count("group_info")) { - auto attr = fusion_op.attribute("group_info") - .dyn_cast() - .data(); - - group_op_kind = - static_cast(attr.op_pattern_kind) > static_cast(group_op_kind) - ? attr.op_pattern_kind - : group_op_kind; - group->set_loop_ranges(attr.loop_ranges); - group->set_loop_ranges_expr(attr.loop_ranges_expr); - group->set_reduce_axis(attr.reduce_axis); - group->set_alignment_schedule_info(attr.alignment_schedule_info); - } + PADDLE_ENFORCE_GT(fusion_op.attributes().count("group_info"), + 0UL, + phi::errors::InvalidArgument( + "fusion_op should have a group_info attribute.")); + + const auto attr = fusion_op.attribute("group_info") + .dyn_cast() + .data(); + + const auto& fn_name = attr.fn_name; + auto group = std::make_shared(ops, fn_name); + + group_op_kind = + static_cast(attr.op_pattern_kind) > static_cast(group_op_kind) + ? attr.op_pattern_kind + : group_op_kind; + group->set_loop_ranges(attr.loop_ranges); + group->set_loop_ranges_expr(attr.loop_ranges_expr); + group->set_reduce_axis(attr.reduce_axis); + group->set_alignment_schedule_info(attr.alignment_schedule_info); group->set_op_pattern_kind(group_op_kind); // Rebuild output_ops and input_ops of the group diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 648b3af363241..89ca95884fb52 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -33,116 +33,128 @@ namespace dialect { namespace ir { using CompatibleInfo = cinn::hlir::framework::pir::CompatibleInfo; -class SumOpPattern : public paddle::drr::DrrPatternBase { - public: - std::string name() const override { return "SumOpPattern"; } - - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); - - const auto &sum = pattern.Op(paddle::dialect::SumOp::name(), - {{"dtype", pattern.Attr("dtype")}, - {"keepdim", pattern.Attr("keep_dim")}}); - pattern.Tensor("ret") = sum(pattern.Tensor("arg0"), full_int_array()); - - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_sum = - res.Op(cinn::dialect::ReduceSumOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype")}, - {"keep_dim", pattern.Attr("keep_dim")}}); - res.Tensor("ret") = cinn_reduce_sum(res.Tensor("arg0")); +namespace { + +template +std::vector GetVectorFromIntArrayAttribute( + const pir::ArrayAttribute &array_attr) { + const auto &vector_attr = array_attr.AsVector(); + + std::vector result; + if (vector_attr.size() > 0) { + PADDLE_ENFORCE_EQ(vector_attr[0].isa<::pir::Int64Attribute>(), + true, + phi::errors::Unimplemented( + "the 0th element 
MUST be ir::Int64Attribute")); + for (size_t i = 0; i < vector_attr.size(); ++i) { + result.push_back(vector_attr[i].dyn_cast<::pir::Int64Attribute>().data()); + } } -}; + return result; +} -class MaxOpPattern : public paddle::drr::DrrPatternBase { - public: - std::string name() const override { return "MaxOpPattern"; } +} // namespace - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); +class SumOpPattern : public pir::OpRewritePattern { + public: + using pir::OpRewritePattern::OpRewritePattern; - const auto &pd_max = pattern.Op(paddle::dialect::MaxOp::name(), - {{"keepdim", pattern.Attr("keep_dim")}}); - pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array()); + bool Match(paddle::dialect::SumOp op) const override { + if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false; + auto *axes_op = op->operand_source(1).defining_op(); + return axes_op && axes_op->isa(); + } - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_max = - res.Op(cinn::dialect::ReduceMaxOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"keep_dim", pattern.Attr("keep_dim")}}); - res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0")); + void Rewrite(paddle::dialect::SumOp op, + pir::PatternRewriter &rewriter) const override { + auto *axes_op = op->operand_source(1).defining_op(); + auto full_int_array_op = + axes_op->dyn_cast(); + + // get attribute value from full_int_array op + const std::vector axis = GetVectorFromIntArrayAttribute( + full_int_array_op.attribute("value").dyn_cast()); + const bool keep_dim = + op.attribute("keepdim").dyn_cast<::pir::BoolAttribute>().data(); + const auto &dtype = op.attribute("dtype") + .dyn_cast() + .data(); + + auto cinn_reduce = rewriter.Build( + op->operand_source(0), axis, keep_dim, dtype); + rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0)); + rewriter.EraseOp(op); + if (full_int_array_op->use_empty()) { + rewriter.EraseOp(full_int_array_op); + } } }; -class MinOpPattern : public paddle::drr::DrrPatternBase { +template +class ReduceMinMaxOpPattern : public pir::OpRewritePattern { public: - std::string name() const override { return "MinOpPattern"; } + using pir::OpRewritePattern::OpRewritePattern; - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); - - const auto &pd_max = pattern.Op(paddle::dialect::MinOp::name(), - {{"keepdim", pattern.Attr("keep_dim")}}); - pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array()); + bool Match(SOURCE_OP op) const override { + if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false; + auto *axes_op = op->operand_source(1).defining_op(); + return axes_op && axes_op->template isa(); + } - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_max = - res.Op(cinn::dialect::ReduceMinOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"keep_dim", 
pattern.Attr("keep_dim")}}); - res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0")); + void Rewrite(SOURCE_OP op, pir::PatternRewriter &rewriter) const override { + auto *axes_op = op->operand_source(1).defining_op(); + auto full_int_array_op = + axes_op->template dyn_cast(); + + // get attribute value from full_int_array op + const std::vector axis = GetVectorFromIntArrayAttribute( + full_int_array_op.attribute("value") + .template dyn_cast()); + const bool keep_dim = op.attribute("keepdim") + .template dyn_cast<::pir::BoolAttribute>() + .data(); + + auto cinn_reduce = + rewriter.Build(op->operand_source(0), axis, keep_dim); + rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0)); + rewriter.EraseOp(op); + if (full_int_array_op->use_empty()) { + rewriter.EraseOp(full_int_array_op); + } } }; -class ProdOpPattern : public paddle::drr::DrrPatternBase { +class ProdOpPattern : public pir::OpRewritePattern { public: - std::string name() const override { return "ProdOpPattern"; } + using pir::OpRewritePattern::OpRewritePattern; - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); - - const auto &pd_max = - pattern.Op(paddle::dialect::ProdOp::name(), - {{"keep_dim", pattern.Attr("keep_dim")}, - {"reduce_all", pattern.Attr("reduce_all")}}); - pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array()); + bool Match(paddle::dialect::ProdOp op) const override { + if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false; + auto *axes_op = op->operand_source(1).defining_op(); + return axes_op && axes_op->isa(); + } - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_max = - res.Op(cinn::dialect::ReduceProdOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"keep_dim", pattern.Attr("keep_dim")}, - {"reduce_all", pattern.Attr("reduce_all")}}); - res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0")); + void Rewrite(paddle::dialect::ProdOp op, + pir::PatternRewriter &rewriter) const override { + auto *axes_op = op->operand_source(1).defining_op(); + auto full_int_array_op = + axes_op->dyn_cast(); + + // get attribute value from full_int_array op + const std::vector axis = GetVectorFromIntArrayAttribute( + full_int_array_op.attribute("value").dyn_cast()); + const bool keep_dim = + op.attribute("keep_dim").dyn_cast<::pir::BoolAttribute>().data(); + const bool reduce_all = + op.attribute("reduce_all").dyn_cast<::pir::BoolAttribute>().data(); + + auto cinn_reduce = rewriter.Build( + op->operand_source(0), axis, keep_dim, reduce_all); + rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0)); + rewriter.EraseOp(op); + if (full_int_array_op->use_empty()) { + rewriter.EraseOp(full_int_array_op); + } } }; @@ -1117,10 +1129,12 @@ pir::RewritePatternSet PdOpToCinnOpPass::InitializePatterns( pir::RewritePatternSet ps(context); ps.Add( context); // NOTE, scale op pattern should before AddBroadcastTo - ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); + ps.Add(context); + ps.Add>(context); + ps.Add>(context); + ps.Add(context); ps.Add(context); ps.Add(context); ps.Add(context); diff --git 
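The reduce-op rewrites above drop DRR in favor of plain pir::OpRewritePattern: Match guards on CompatibleInfo::IsDeniedForCinn and on the axes operand being produced by a FullIntArrayOp, Rewrite reads the attributes directly, and a single class template serves both min and max. With the angle-bracket arguments reconstructed from the class definitions above, the attribute-extraction helper and the pattern registrations look like:

// Read the int64 axes out of the FullIntArrayOp's "value" attribute.
const std::vector<int64_t> axis = GetVectorFromIntArrayAttribute<int64_t>(
    full_int_array_op.attribute("value").dyn_cast<pir::ArrayAttribute>());

// One instantiation per (source op, cinn reduce op) pair in InitializePatterns.
ps.Add<SumOpPattern>(context);
ps.Add<ReduceMinMaxOpPattern<paddle::dialect::MaxOp,
                             cinn::dialect::ReduceMaxOp>>(context);
ps.Add<ReduceMinMaxOpPattern<paddle::dialect::MinOp,
                             cinn::dialect::ReduceMinOp>>(context);
ps.Add<ProdOpPattern>(context);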
a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc index 74f3e4b4f200d..234421cf27600 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc @@ -866,27 +866,52 @@ struct PirToPyCodeConverterHelper { } std::string ConvertInputTypes(const pir::Operation* op) { - std::stringstream ss; - ss << "["; - for (int i = 0; i < op->num_operands(); ++i) { - if (i > 0) { - ss << ", "; + const auto& VisitValue = [&](const auto& DoEachValue) { + for (int i = 0; i < op->num_operands(); ++i) { + DoEachValue(op->operand_source(i)); } - ss << ConvertType(op->operand_source(i).type()); - } - ss << "]"; - return ss.str(); + }; + return ConvertValueTypes(VisitValue); + } + + std::string ConvertBlockArgTypes(const pir::Block& block) { + const auto& VisitValue = [&](const auto& DoEachValue) { + for (const auto& arg : block.args()) { + DoEachValue(arg); + } + }; + return ConvertValueTypes(VisitValue); + } + + std::string ConvertBlockKwArgTypes(const pir::Block& block) { + const auto& VisitValue = [&](const auto& DoEachValue) { + for (const auto& [_, arg] : block.kwargs()) { + DoEachValue(arg); + } + }; + return ConvertValueTypes(VisitValue); } std::string ConvertOutputTypes(const pir::Operation* op) { + const auto& VisitValue = [&](const auto& DoEachValue) { + for (int i = 0; i < op->num_results(); ++i) { + DoEachValue(op->result(i)); + } + }; + return ConvertValueTypes(VisitValue); + } + + template + std::string ConvertValueTypes(const VisitValueT& VisitValue) { std::stringstream ss; ss << "["; - for (int i = 0; i < op->num_results(); ++i) { - if (i > 0) { + int i = 0; + VisitValue([&](pir::Value value) { + if (i++ > 0) { ss << ", "; } - ss << ConvertType(op->result(i).type()); - } + ss << ConvertType(value.type()); + }); ss << "]"; return ss.str(); } @@ -1098,7 +1123,45 @@ struct PirToPyCodeConverterHelper { } ss << "]"; } - ss << "]"; + ss << "], "; + } + { + int i = 0; + ss << "block_positional_arg_types=["; + for (const auto& region : *op) { + if (i++ > 0) { + ss << ","; + } + int j = 0; + ss << "["; + for (const auto& block : region) { + if (j++ > 0) { + ss << ","; + } + ss << ConvertBlockArgTypes(block); + } + ss << "]"; + } + ss << "], "; + } + { + int i = 0; + ss << "block_keyword_arg_types=["; + for (const auto& region : *op) { + if (i++ > 0) { + ss << ","; + } + int j = 0; + ss << "["; + for (const auto& block : region) { + if (j++ > 0) { + ss << ","; + } + ss << ConvertBlockKwArgTypes(block); + } + ss << "]"; + } + ss << "], "; } return ss.str(); } @@ -1138,18 +1201,10 @@ struct PirToPyCodeConverterHelper { std::string GetPyClassName() { std::ostringstream ss; - ss << "PirProgram_" << RandomInt(); + ss << "PirProgram_" << program_->id(); return ss.str(); } - int64_t RandomInt() { - std::random_device rd{}; - std::mt19937_64 gen(rd()); - std::uniform_int_distribution dis( - 0, std::numeric_limits::max()); - return dis(gen); - } - std::string ConvertIStringsToString(const IStrings& istrings) { std::stringstream ss; for (const auto& istring : istrings) { diff --git a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc index 4dd7e3ecf3e7d..98a8ff2e7ec3e 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc +++ 
b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc @@ -136,10 +136,10 @@ struct CachedDimExprToValueConverter { ->Build(value, 0, dims.size() - 1) .out(); }; - if (tensor_dim.value.type() - .dyn_cast() - .dims() - .size() == 0) { + const auto& ddim = tensor_dim.value.type() + .dyn_cast() + .dims(); + if (ddim.size() == 0 || (ddim.size() == 1 && ddim[0] == 1)) { return CastToInt64IfNeed(tensor_dim.value); } return CastToInt64IfNeed(rewriter diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index 4d5284f22f6ed..3711f102dc2e8 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -31,6 +31,8 @@ struct BucketLoweredFuncsWrapper { std::vector> predicate2funcs; ir::LoweredFunc infer_shape_func; + std::vector> + predicate2funcsCX86; }; template diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.cc b/paddle/cinn/hlir/framework/pir/compilation_cache.cc index 1c5322c38866e..86f65bfb5c8db 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.cc @@ -37,11 +37,22 @@ void* BackendResource::GetInferFuncPtr() const { return ptr; } +void* BackendResource::GetCX86HostFuncPtr() const { + VLOG(4) << "Lookup kernel name: " << host_fn_name_ + "_CX86"; + void* ptr = backend_compiler_->Lookup(host_fn_name_ + "_CX86"); + PADDLE_ENFORCE_NOT_NULL( + ptr, + ::common::errors::InvalidArgument("Can't find kernel function %s", + host_fn_name_ + "_CX86")); + return ptr; +} + pir::CINNKernelInfo BackendResource::GenerateKernelInfo() const { pir::CINNKernelInfo kernel_info; kernel_info.fn_name = host_fn_name_; kernel_info.fn_ptr = GetHostFuncPtr(); kernel_info.infer_shape_fn_ptr = GetInferFuncPtr(); + kernel_info.CX86_fn_ptr = GetCX86HostFuncPtr(); kernel_info.int_args_map = GetIntArgsMap(); return kernel_info; } diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.h b/paddle/cinn/hlir/framework/pir/compilation_cache.h index 0294755d399ef..f0f6c53380395 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.h +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.h @@ -41,6 +41,7 @@ class BackendResource final { void* GetHostFuncPtr() const; void* GetInferFuncPtr() const; + void* GetCX86HostFuncPtr() const; const std::map& GetIntArgsMap() const { return int_args_map_; } diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index 1304979d14a61..39ddcf8291306 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -29,6 +29,11 @@ void GroupCompilationContext::SetLoweredFuncs( predicates_.push_back(std::move(predicate2func.first)); lowered_funcs_.push_back(std::move(predicate2func.second)); } + for (std::pair& predicate2func : + funcs.predicate2funcsCX86) { + CX86_predicates_.push_back(std::move(predicate2func.first)); + CX86_lowered_funcs_.push_back(std::move(predicate2func.second)); + } infer_shape_lowered_func_ = std::move(funcs.infer_shape_func); } @@ -73,11 +78,24 @@ std::shared_ptr CompilationTask::CodegenAndJit() { } builder.SetInferShapeFunc(context_->infer_shape_lowered_func_); ir::Module ir_module = builder.Build(); - return BuildPirCINNKernelInfo(ir_module); + + ir::Module::Builder builder_CX86(cinn::common::UniqName("module"), + common::DefaultHostTarget()); + CHECK_EQ(context_->CX86_predicates_.size(), + 
context_->CX86_lowered_funcs_.size()); + for (const ir::Expr& predicate : context_->CX86_predicates_) { + builder_CX86.AddPredicate(predicate); + } + for (const ir::LoweredFunc& func : context_->CX86_lowered_funcs_) { + builder_CX86.AddFunction(func); + } + ir::Module ir_moduleCX86 = builder_CX86.Build(); + + return BuildPirCINNKernelInfo(ir_module, ir_moduleCX86); } std::shared_ptr CompilationTask::BuildPirCINNKernelInfo( - const ir::Module& module) { + const ir::Module& module, const ir::Module& CX86module) { auto compilation_result = std::make_shared(context_->target_); auto backend_resource = std::make_shared( @@ -86,7 +104,8 @@ std::shared_ptr CompilationTask::BuildPirCINNKernelInfo( context_->group_->FuncName() + "_infer_shape", context_->group_->int_args_map()); VLOG(5) << "Start to compile module into cuda kernel..."; - backend_resource->GetBackendCompiler()->Build(module, ""); + backend_resource->GetBackendCompiler()->Build(module, "", false); + backend_resource->GetBackendCompiler()->AppendCX86(CX86module); compilation_result->SetBackendResource(backend_resource); VLOG(5) << "End to compile module into cuda kernel."; return compilation_result; diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.h b/paddle/cinn/hlir/framework/pir/compilation_task.h index d104d264b6852..1ed3e2d5e6217 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.h +++ b/paddle/cinn/hlir/framework/pir/compilation_task.h @@ -42,6 +42,8 @@ class GroupCompilationContext { const pir::OpLoweringGroupPtr& group_; std::vector predicates_; std::vector lowered_funcs_; + std::vector CX86_predicates_; + std::vector CX86_lowered_funcs_; ir::LoweredFunc infer_shape_lowered_func_; }; @@ -56,7 +58,7 @@ class CompilationTask { void Lowering(); std::shared_ptr CodegenAndJit(); std::shared_ptr BuildPirCINNKernelInfo( - const ir::Module& module); + const ir::Module& module, const ir::Module& CX86module); GroupCompilationContext* context_; }; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc index e5187f47ab471..e23ec953431c0 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc @@ -145,8 +145,9 @@ std::shared_ptr OpLoweringGroup::Clone( ops_mapper[op] = new_op; } + const auto new_fn_name = this->fn_name_ + "_cloned"; // Construct Base information for new Group - auto new_group = std::make_shared(new_ops); + auto new_group = std::make_shared(new_ops, new_fn_name); for (auto* op : this->output_ops_) { new_group->output_ops_.insert(ops_mapper.at(op)); } diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.h b/paddle/cinn/hlir/framework/pir/op_lowering_group.h index 935e759ed2331..7595985d4d5b9 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.h @@ -42,15 +42,13 @@ class OpLoweringGroup { OpLoweringGroup(const OpLoweringGroup&) = delete; OpLoweringGroup(OpLoweringGroup&&) = delete; - explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops) - : ops_(group_ops) { - fn_name_ = CompatibleInfo::GroupOpsName(ops_); - } + explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops, + const std::string& fn_name) + : ops_(group_ops), fn_name_(fn_name) {} - explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops) - : ops_(group_ops) { - fn_name_ = CompatibleInfo::GroupOpsName(ops_); - } + explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops, 
+ const std::string& fn_name) + : ops_(group_ops), fn_name_(fn_name) {} const std::string& FuncName() const { return this->fn_name_; } ::pir::Block* GetParentBlock() const; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 8ba8753a84eaf..4c4362aec935d 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -20,6 +20,7 @@ #include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/backends/codegen_device_util.h" #include "paddle/cinn/common/dim_expr_converter.h" +#include "paddle/cinn/common/target.h" #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h" #include "paddle/cinn/hlir/framework/compile_error.h" @@ -124,19 +125,9 @@ std::shared_ptr OpLowererImpl::GetGroupInfo( } } - BuildBroadcastInfo(group, group_info); - for (auto& op : group->output_ops()) { group_info->direct_output_var_names.insert(ValueName(op->result(0))); // collect all output tensor. - if (op->name() == "cinn_op.yield_store") { - auto input_var_name = ValueName(op->operand_source(0)); - if (group_info->broadcast_info.count(input_var_name)) { - auto base_info = group_info->broadcast_info[input_var_name]; - base_info.with_constrain = true; - group_info->broadcast_info[ValueName(op->result(0))] = base_info; - } - } for (auto opresult : op->results()) { if (tensor_map.count(opresult) == 0) { continue; @@ -146,13 +137,7 @@ std::shared_ptr OpLowererImpl::GetGroupInfo( } for (const auto& val : group->output_values()) { - if (val.defining_op()->name() == "cinn_op.reshape" && - erase_reshape.count(val.defining_op())) { - group_info->direct_output_var_names.insert( - ValueName(val.defining_op()->operand_source(0))); - } else { - group_info->direct_output_var_names.insert(ValueName(val)); - } + group_info->direct_output_var_names.insert(ValueName(val)); } return group_info; } @@ -207,6 +192,8 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( if (ops.size() == 1 && ops[0]->name() == "custom_call") { return {{{ir::Expr(1), LowerCustomCall(group)[0]}}, ir::LoweredFunc()}; } + auto X86Expr = LowerX86(group, ops, apply_op_schedule); + VLOG(3) << "After x86 lower, ir is: \n" << X86Expr; std::vector group_func_arg_tensors; std::unordered_map<::pir::Value, ir::Tensor> tensor_map; @@ -272,6 +259,9 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( ir_sch.GetModule().GetExprs()[0]); } + // The last func is stored as a kernel on x86 + cond2func_bodies.emplace_back(ir::Expr(true), X86Expr); + // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. @@ -296,10 +286,16 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( "The size of funcs and cond2func_bodies should be " "the same.")); BucketLoweredFuncsWrapper funcs_wrapper; - for (int i = 0; i < funcs.size(); ++i) { + for (int i = 0; i < funcs.size() - 1; ++i) { funcs_wrapper.predicate2funcs.emplace_back(cond2func_bodies[i].first, funcs[i]); } + // The last func is x86 kernel. 
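Before the loop below tags that trailing function with the "_CX86" suffix, it may help to make the bucket layout concrete. This is a minimal standalone sketch, with hypothetical names rather than Paddle's actual runtime types, of how predicate-guarded device buckets plus a trailing host fallback could be dispatched; in the patch itself the pairs are only stored here and compiled later.

#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

using Predicate = std::function<bool(int64_t)>;  // e.g. a shape guard
using Kernel = std::function<void()>;

// Hypothetical dispatcher: first matching device bucket wins; otherwise the
// host function (the "_CX86"-suffixed body in this patch) runs.
void Dispatch(const std::vector<std::pair<Predicate, Kernel>>& device_buckets,
              const Kernel& host_fallback, int64_t numel) {
  for (const auto& [pred, kernel] : device_buckets) {
    if (pred(numel)) {
      kernel();
      return;
    }
  }
  host_fallback();
}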
+ for (size_t i = funcs.size() - 1; i < funcs.size(); ++i) { + funcs[i]->name = funcs[i]->name + "_CX86"; + funcs_wrapper.predicate2funcsCX86.emplace_back(cond2func_bodies[i].first, + funcs[i]); + } funcs_wrapper.infer_shape_func = GenerateInferShapeFunc(group, infer_shape_tensor_args, group_func_args); @@ -514,159 +510,6 @@ std::vector OpLowererImpl::LowerGroup( &infer_shape_args); } -void OpLowererImpl::BuildBroadcastInfo(const OpLoweringGroupPtr& group, - std::shared_ptr group_info) { - // TODO(phlrain): this is primary verion for loop aligment - // will be update by a new method - auto& align_info = group->mut_alignment_schedule_info(); - - auto& ops = group->ops(); - for (auto op1 : ops) { - auto it = align_info.find(op1); - if (it == align_info.end()) { - continue; - } - if (op1->name() == "cinn_op.generate_shape") { - continue; - } - - if (it->second.size() > 1) { - for (size_t i = 0; i < it->second.size(); ++i) { - } - // TODO(phlran): merge to factor info here - it->second.front().factor_info = it->second.back().factor_info; - it->second.resize(1); - } - - PADDLE_ENFORCE_EQ( - it->second.size(), - 1, - phi::errors::Unimplemented("%s, only suppopt one transform yet", - it->first->name())); - - if (it->second[0].type == ScheduleAlignType::kBroadcast) { - // get broadcast op - auto broadcast_axes = it->second[0].axis_info; - auto output_shape = it->second[0].factor_info; - - phi::DDim in_dim; - - if (it->first->name() == "cinn_op.reshape") { - // TODO(phlrain): deal with reshape in a better way - if (it->first->result(0).use_count() == 1 && - it->first->result(0).first_use().owner()->isa<::pir::YieldOp>()) { - continue; - } - } - - if ((it->first->name() != "cinn_op.reshape") && - (it->first->name() != "cinn_op.broadcast") && - (it->first->num_operands() == 1)) { - in_dim = it->first->operand_source(0) - .type() - .dyn_cast() - .dims(); - } else { - in_dim = it->first->result(0) - .type() - .dyn_cast() - .dims(); - } - - cinn::ir::BroadcastInfo info; - if (in_dim.size() == 1u && in_dim[0] == 1u) { - info.full_broadcast = true; - for (size_t i = 0; i < output_shape.size(); ++i) { - info.broadcast_axes.push_back(i); - info.output_shape.push_back(-1); - info.output_dim_expr.push_back(group->loop_ranges_expr()[i]); - } - } else if (in_dim.size() == broadcast_axes.size()) { - if (in_dim.size() != output_shape.size()) { - info.split_first = true; - - if (broadcast_axes.size() == 1) { - std::vector temp_shape(output_shape.size(), 1); - temp_shape[broadcast_axes[0]] = output_shape[broadcast_axes[0]]; - info.split_info.emplace_back(0, temp_shape); - - for (size_t i = 0; i < output_shape.size(); ++i) { - if (i != broadcast_axes[0]) { - info.broadcast_axes.push_back(i); - info.output_shape.push_back(output_shape[i]); - } - } - } else { - throw std::runtime_error("not support multi dim broadcast yet"); - } - } else { - for (size_t i = 0; i < broadcast_axes.size(); ++i) { - if (in_dim[i] < 0 || output_shape[broadcast_axes[i]] < 0) { - continue; - } - if (in_dim[i] != output_shape[broadcast_axes[i]]) { - if (in_dim[i] != 1) { - throw std::runtime_error("Only support 1 - D broadcast "); - } - info.broadcast_axes.push_back(i); - info.output_shape.push_back(output_shape[broadcast_axes[i]]); - } - } - } - } else { - // only deal with broadcast axes - std::set axes_set; - for (size_t i = 0; i < broadcast_axes.size(); ++i) { - axes_set.insert(broadcast_axes[i]); - if (in_dim[broadcast_axes[i]] != 1) { - throw std::runtime_error("Only support 1 - D broadcast "); - } - - 
info.broadcast_axes.push_back(broadcast_axes[i]); - info.output_shape.push_back(output_shape[broadcast_axes[i]]); - } - } - - for (size_t i = 0; i < it->first->num_operands(); ++i) { - if (!align_info.count(it->first->operand_source(i).defining_op())) { - info.first_broadcast = true; - break; - } - } - - auto op_out = it->first->result(0); - info.op_name = it->first->name(); - - if (op_out.use_count() == 1 && - op_out.first_use().owner()->name() == "cf.yield") { - info.with_constrain = true; - } - - if (erase_reshape.count(op_out.first_use().owner())) { - info.with_constrain = true; - } - - group_info->broadcast_info[ValueName(op_out)] = info; - - for (auto use_it = op_out.use_begin(); use_it != op_out.use_end(); - ++use_it) { - if (use_it->owner()->name() == "cf.yield") { - continue; - } - if (CompatibleInfo::OpKind(*(use_it->owner())) == - framework::kBroadcast) { - if (!info.full_broadcast) { - group_info->broadcast_to_elementwise[ValueName( - use_it->owner()->result(0))] = info; - } - } - } - } else { - throw std::runtime_error("only supportbroadcast type for now"); - } - } -} - std::vector OpLowererImpl::LowerCustomCall( const OpLoweringGroupPtr& group) { const auto& ops = group->ops(); @@ -777,10 +620,6 @@ std::vector OpLowererImpl::PostProcess( } } infer_shape_arg_tensor->push_back(tensor); - if ((op_result.defining_op()->name() == "cinn_op.reshape") && - erase_reshape.count(op_result.defining_op())) { - tensor = tensor_map.at(op_result.defining_op()->operand_source(0)); - } if (arg_name_set.count(tensor->buffer->name) != 0) { continue; @@ -846,18 +685,21 @@ std::vector OpLowererImpl::PostProcess( } } std::vector lowered_funcs; - for (ir::Expr func_body : func_bodies) { + for (int i = 0; i < func_bodies.size(); ++i) { + ir::Expr func_body = func_bodies[i]; optim::EliminateDeadScheduleBlock(&(func_body), group->output_names()); - cinn::common::DefaultDeviceTarget().arch.Match( - [&](std::variant) {}, - [&](common::NVGPUArch) { + if (i != func_bodies.size() - 1) { + cinn::common::DefaultDeviceTarget().arch.Match( + [&](std::variant) {}, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - optim::EliminateCommonGlobalMemoryRead(&(func_body)); - optim::OptimizeExprGPU(&(func_body)); + optim::EliminateCommonGlobalMemoryRead(&(func_body)); + optim::OptimizeExprGPU(&(func_body)); #endif - }); + }); + } // 2.Prepare temp buffers auto temp_buffers = @@ -869,8 +711,13 @@ std::vector OpLowererImpl::PostProcess( func->PrepareBufferCastExprs(); } // 4.Apply low level pass - func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); - optim::RearrangeLoadInstruction(&(func->body)); + if (i != func_bodies.size() - 1) { + func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); + optim::RearrangeLoadInstruction(&(func->body)); + } else { + func = optim::Optimize(Expr(func), common::DefaultHostTarget(), false) + .as_lowered_func_ref(); + } lowered_funcs.push_back(std::move(func)); } @@ -1327,6 +1174,73 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( {}); return infer_shape_func; } +ir::Expr OpLowererImpl::LowerX86(const OpLoweringGroupPtr& group, + const std::vector<::pir::Operation*>& ops, + bool apply_op_schedule) { + std::vector group_func_arg_tensors; + std::unordered_map<::pir::Value, ir::Tensor> tensor_map; + // for some op, it will output more tmp value and regard as + // XX_0, XX_1, so we log them in tmp_tensor_info; + std::unordered_map tmp_tensor_info; + + auto need_lower_x86 = [&]() -> bool { + for (auto* op : ops) { + for (size_t i = 0; i < 
op->num_operands(); ++i) { + auto in = op->operand_source(i); + auto type_info = in.type().dyn_cast(); + auto dtype = type_info.dtype(); + const auto& dims = type_info.dims(); + std::vector sym_shape; + // 1. Dynamic shapes are not lowered for x86. + if (::common::contain_unknown_dim(dims)) { + return false; + } + // 2. Tensors with more than 4 elements are not lowered for x86. + int64_t sym_shape_size = 1; + for (int i = 0; i < dims.size(); ++i) { + sym_shape_size *= dims[i]; + if (sym_shape_size > 4) { + return false; + } + } + } + + std::vector out_types; + std::vector> out_shapes; + CollectOutputInfo(op, &out_types, &out_shapes, group); + for (const auto& tt : out_types) { + // 3. float16 outputs are not lowered for x86. + if (tt.is_float16()) { + return false; + } + } + } + return true; + }; + if (!need_lower_x86()) { + return ir::Expr(-1); + } + + this->target_ = common::DefaultHostTarget(); + cinn::runtime::CurrentTarget::SetCurrentTarget(this->target_); + + std::vector func_bodies = + LowerOps(group, + ops, + apply_op_schedule, + &OpLowererImpl::DyShapeScheduleDetermineFunction, + &group_func_arg_tensors, + &tensor_map, + &tmp_tensor_info); + this->target_ = common::DefaultNVGPUTarget(); + cinn::runtime::CurrentTarget::SetCurrentTarget(this->target_); + ir::ModuleExpr mod_expr(func_bodies); + ir::IRSchedule ir_sch( + mod_expr, -1, false, cinn::utils::ErrorMessageLevel::kGeneral, true); + ir_sch.MergeExprs(); + auto X86Expr = ir::ir_utils::IRCopy(ir_sch.GetModule().GetExprs().at(0)); + return X86Expr; +} } // namespace pir } // namespace framework diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index 838b70da20fa5..9edb88ec3e431 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -57,10 +57,6 @@ struct GroupInfo { std::set shared_var_names; std::set direct_output_var_names; std::vector broadcast_output_names; - - std::unordered_map broadcast_info; - std::unordered_map - broadcast_to_elementwise; }; class OpLowererImpl : public OpLowererImplBase { @@ -296,12 +292,11 @@ class OpLowererImpl : public OpLowererImplBase { void BuildBroadcastInfo(const OpLoweringGroupPtr& group, std::shared_ptr group_info); - Target target_; - + ir::Expr LowerX86(const OpLoweringGroupPtr& group, + const std::vector<::pir::Operation*>& ops, + bool apply_op_schedule); PrettyNamer* name_gene_; - - std::unordered_set<::pir::Operation*> erase_reshape; }; } // namespace pir diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h index c489e1847f26f..e3e4e8163cfb9 100644 --- a/paddle/cinn/hlir/framework/pir/utils.h +++ b/paddle/cinn/hlir/framework/pir/utils.h @@ -33,6 +33,7 @@ struct CINNKernelInfo { std::string fn_name; void* fn_ptr; void* infer_shape_fn_ptr; + void* CX86_fn_ptr; struct ArgDimIdx { int arg_idx; diff --git a/paddle/cinn/hlir/pass/alterlayout.cc b/paddle/cinn/hlir/pass/alterlayout.cc index 74c8c0915e0af..a747c57dd77af 100644 --- a/paddle/cinn/hlir/pass/alterlayout.cc +++ b/paddle/cinn/hlir/pass/alterlayout.cc @@ -20,7 +20,7 @@ #include "paddle/cinn/hlir/pe/schedule.h" #include "paddle/cinn/ir/layout.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -119,10 +119,26 @@ std::vector UpdateInferInfos( CHECK(!infertypes.empty()) << node->op()->name << " finds no infertype"; CHECK(!inferlayouts.empty()) << node->op()->name << " finds no inferlayout"; auto outlinks = node->outlinks_in_order(); -
CHECK_EQ(infershapes.size(), infertypes.size()); - CHECK_EQ(inferlayouts.size(), 2U); - CHECK_EQ(infertypes.size(), inferlayouts[0].size()); - CHECK_EQ(outlinks.size(), infershapes.size()); + PADDLE_ENFORCE_EQ( + infershapes.size(), + infertypes.size(), + phi::errors::InvalidArgument( + "The size of infershapes and infertypes should be equal")); + PADDLE_ENFORCE_EQ(inferlayouts.size(), + 2U, + phi::errors::InvalidArgument( + "The size of inferlayouts should be 2, but got %d", + inferlayouts.size())); + PADDLE_ENFORCE_EQ( + infertypes.size(), + inferlayouts[0].size(), + phi::errors::InvalidArgument( + "The size of infertypes and inferlayouts[0] should be equal")); + PADDLE_ENFORCE_EQ( + outlinks.size(), + infershapes.size(), + phi::errors::InvalidArgument( + "The size of outlinks and infershapes should be equal")); for (int i = 0; i < outlinks.size(); i++) { auto* sink = outlinks[i]->sink(); @@ -181,7 +197,11 @@ void AlterLayoutPass(Graph* graph) { node->attrs.attr_store.at("dilation")); } const auto& conv_inlinks = node->inlinks_in_order(); - CHECK_EQ(conv_inlinks.size(), 2U) << "conv2d should have 2 inputs"; + PADDLE_ENFORCE_EQ(conv_inlinks.size(), + 2U, + phi::errors::InvalidArgument( + "conv2d should have 2 inputs, but got %d", + conv_inlinks.size())); std::vector> inputs_shape; for (auto& link : conv_inlinks) { auto* source = link->source(); @@ -231,8 +251,11 @@ void AlterLayoutPass(Graph* graph) { input_nodes.push_back(source); } // get new layout: ic_bn, oc_bn - CHECK_EQ(input_nodes.size(), 2U) - << "conv2d should have 2 input nodes"; + PADDLE_ENFORCE_EQ(input_nodes.size(), + 2U, + phi::errors::InvalidArgument( + "conv2d should have 2 input nodes, but got %d", + input_nodes.size())); auto* input_node = input_nodes[0]; auto* weight_node = input_nodes[1]; CHECK(shape_dict.count(input_node->id())) @@ -347,8 +370,11 @@ void AlterLayoutPass(Graph* graph) { conv2d_NCHWc_inputtypes.push_back(trans_out_dtypes); conv2d_NCHWc_inputlayouts.push_back(dst_input_layout); } else { - CHECK_EQ(input_shape.size(), 5U) - << "conv2d_NCHWc op's input shape dim should be 5"; + PADDLE_ENFORCE_EQ( + input_shape.size(), + 5U, + phi::errors::InvalidArgument( + "conv2d_NCHWc op's input shape dim should be 5")); conv2d_NCHWc_inputshapes.push_back(input_shape); conv2d_NCHWc_inputtypes.push_back(input_type); CHECK(layout_dict.count(input_node->id())) @@ -395,8 +421,11 @@ void AlterLayoutPass(Graph* graph) { conv2d_NCHWc_inputtypes.push_back(trans_out_dtypes); conv2d_NCHWc_inputlayouts.push_back(dst_kernel_layout); } else { - CHECK_EQ(weight_shape.size(), 6U) - << weight_node->id() << " shape dim should be 6"; + PADDLE_ENFORCE_EQ( + weight_shape.size(), + 6U, + phi::errors::InvalidArgument( + "conv2d_NCHWc op's weight shape dim should be 6")); conv2d_NCHWc_inputshapes.push_back(weight_shape); conv2d_NCHWc_inputtypes.push_back(weight_type); CHECK(layout_dict.count(weight_node->id())) @@ -477,12 +506,29 @@ void AlterLayoutPass(Graph* graph) { input_shapes, input_layouts, node->attrs, graph->target_); // if input inferred layouts is different from original's, expand dims // or do transformation. 
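All of the assertion rewrites in this file, and in the passes below, follow one mechanical recipe; as a reference, this is the general before/after shape, taken directly from the hunk just below, so nothing here is new API:

  // before: glog-style check, message appended with operator<<
  CHECK_EQ(inferlayouts.size(), 2U);

  // after: PADDLE_ENFORCE_* with an explicit phi::errors payload and a
  // printf-style message that reports the observed value
  PADDLE_ENFORCE_EQ(
      inferlayouts.size(),
      2U,
      phi::errors::InvalidArgument(
          "The size of inferlayouts should be 2, but got %d",
          inferlayouts.size()));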
- CHECK_EQ(inferlayouts.size(), 2U); + PADDLE_ENFORCE_EQ( + inferlayouts.size(), + 2U, + phi::errors::InvalidArgument( + "The size of inferlayouts should be 2, but got %d", + inferlayouts.size())); auto new_input_layouts = inferlayouts[1]; auto inlinks = node->inlinks_in_order(); - CHECK_EQ(input_layouts.size(), inlinks.size()); - CHECK_EQ(input_layouts.size(), new_input_layouts.size()); - CHECK_EQ(input_layouts.size(), input_shapes.size()); + PADDLE_ENFORCE_EQ( + input_layouts.size(), + inlinks.size(), + phi::errors::InvalidArgument( + "The size of input_layouts and inlinks should be equal")); + PADDLE_ENFORCE_EQ(input_layouts.size(), + new_input_layouts.size(), + phi::errors::InvalidArgument( + "The size of input_layouts and " + "new_input_layouts should be equal")); + PADDLE_ENFORCE_EQ( + input_layouts.size(), + input_shapes.size(), + phi::errors::InvalidArgument("The size of input_layouts and " + "input_shapes should be equal")); bool reset_axis = false; for (int i = 0; i < inlinks.size(); i++) { if (input_layouts[i] != new_input_layouts[i]) { diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc index 0326a4a5fce33..c0bccf285c730 100644 --- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc +++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc @@ -27,7 +27,7 @@ #include "paddle/cinn/hlir/framework/visualize_helper.h" #include "paddle/cinn/hlir/pass/fusion_helper_base.h" #include "paddle/cinn/runtime/custom_function.h" - +#include "paddle/common/enforce.h" namespace cinn::hlir::pass { using framework::Graph; @@ -529,8 +529,10 @@ std::vector CheckFusionAccuracyPass::TopologicalOrder( } } - CHECK_EQ(ordered_nodes.size(), nodes.size()) - << "There has circle in group! Please check."; + PADDLE_ENFORCE_EQ( + ordered_nodes.size(), + nodes.size(), + phi::errors::InvalidArgument("There is a cycle in the group! 
Please check.")); return ordered_nodes; } diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc index 10f5c83e6600d..447da47e147dc 100644 --- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc +++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc @@ -19,7 +19,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn::frontend { using hlir::framework::Graph; @@ -96,7 +96,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -134,7 +138,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_1) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -175,7 +183,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_2) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E", "F"}); } @@ -216,7 +228,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_3) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E", "F"}); } @@ -257,7 +273,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_4) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E", "F"}); } @@ -291,7 +311,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_5) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } @@ -328,7 +352,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_0) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << 
graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -365,7 +393,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_2) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -404,7 +436,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_4) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E"}); } @@ -443,7 +479,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_5) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E"}); } @@ -479,7 +519,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_0) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } @@ -514,7 +558,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_1) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } @@ -552,7 +600,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_2) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C"}); } @@ -590,7 +642,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_3) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the 
pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -629,7 +685,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_4) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -665,7 +725,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_5) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } diff --git a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc index 748948f2206fc..a6fb84f76b832 100644 --- a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc +++ b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/hlir/op/op_util.h" #include "paddle/cinn/utils/functional.h" #include "paddle/cinn/utils/type_defs.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -238,7 +238,10 @@ void fold_expand_dims_fill_constant(const FusionHelperBase* helper, // [0, total_size-1]. check axes can't repeat. std::sort(axes.begin(), axes.end(), std::less()); for (int idx = 0; idx < axes_size - 1; ++idx) { - CHECK_NE(axes[idx], axes[idx + 1]); + PADDLE_ENFORCE_NE(axes[idx], + axes[idx + 1], + phi::errors::InvalidArgument( + "The axes of expand_dims should not repeat.")); } // insert 1 to new shape. 
std::vector n_shape(total_size, 1); diff --git a/paddle/cinn/hlir/pass/dce_pass.cc b/paddle/cinn/hlir/pass/dce_pass.cc index b17f8ee4de5d9..2a68e90bc342a 100644 --- a/paddle/cinn/hlir/pass/dce_pass.cc +++ b/paddle/cinn/hlir/pass/dce_pass.cc @@ -16,7 +16,7 @@ #include "paddle/cinn/common/type.h" #include "paddle/cinn/hlir/pass/op_fusion_pass_util.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -118,7 +118,10 @@ class DceHelper : public FusionHelperBase { }; void DCEPassInternal(Graph* graph) { - CHECK_GT(graph->outputs.size(), 0); + PADDLE_ENFORCE_GT(graph->outputs.size(), + 0, + phi::errors::InvalidArgument( + "The graph should have at least one output node.")); DceHelper dce_helper(graph); dce_helper(); } diff --git a/paddle/cinn/hlir/pass/dce_pass_test.cc b/paddle/cinn/hlir/pass/dce_pass_test.cc index bb9c5d7654851..1ebc0878ee2cb 100644 --- a/paddle/cinn/hlir/pass/dce_pass_test.cc +++ b/paddle/cinn/hlir/pass/dce_pass_test.cc @@ -15,7 +15,7 @@ #include #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace frontend { @@ -36,7 +36,10 @@ TEST(DCE, Test_0) { std::make_shared(program, fetch_ids, target); hlir::framework::ApplyPass(graph.get(), "DCE"); - CHECK_EQ(graph->nodes().size(), 4); + PADDLE_ENFORCE_EQ( + graph->nodes().size(), + 4, + phi::errors::InvalidArgument("The graph should have 4 nodes.")); } TEST(DCE, Test_1) { @@ -59,7 +62,10 @@ TEST(DCE, Test_1) { auto graph = std::make_shared(program, fetch_ids, target); hlir::framework::ApplyPass(graph.get(), "DCE"); - CHECK_EQ(graph->nodes().size(), 8); + PADDLE_ENFORCE_EQ( + graph->nodes().size(), + 8, + phi::errors::InvalidArgument("The graph should have 8 nodes.")); } } // namespace frontend diff --git a/paddle/cinn/hlir/pass/dense_merge_pass.cc b/paddle/cinn/hlir/pass/dense_merge_pass.cc index a726aa1a36c1a..1fc5e4a52b60d 100644 --- a/paddle/cinn/hlir/pass/dense_merge_pass.cc +++ b/paddle/cinn/hlir/pass/dense_merge_pass.cc @@ -15,7 +15,7 @@ #include "paddle/cinn/common/graph_utils.h" #include "paddle/cinn/common/type.h" #include "paddle/cinn/hlir/pass/fusion_helper_base.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -100,7 +100,13 @@ class DenseMergePassHelper : public FusionHelperBase { std::unordered_map> dense_op_map; for (auto dense_op : dense_ops) { const auto& in_links = dense_op->inlinks_in_order(); - CHECK_GT(in_links.size(), pos); + PADDLE_ENFORCE_GT(in_links.size(), + pos, + phi::errors::InvalidArgument( + "The input link size of dense op should be greater " + "than %d, but got %d.", + pos, + in_links.size())); auto sign = GenOpSign(in_links[pos]->source()->safe_as(), dense_op->attrs); if (dense_op_map.count(sign)) { @@ -131,7 +137,14 @@ class DenseMergePassHelper : public FusionHelperBase { const auto& in_links = op->inlinks_in_order(); node->UnLinkSingleTo(op); // link to new node - CHECK_GT(in_links.size(), pos); + PADDLE_ENFORCE_GT( + in_links.size(), + pos, + phi::errors::InvalidArgument("The input link size of dense " + "op should be greater than %d, " + "but got %d.", + pos, + in_links.size())); in_links[pos]->source()->LinkTo(node_tmp); // unlink old dense node in_links[pos]->source()->UnLinkSingleTo(op); diff --git a/paddle/cinn/hlir/pass/dot_merger.cc b/paddle/cinn/hlir/pass/dot_merger.cc index 941cf6b29b66c..6e4e4108ecd91 100644 --- a/paddle/cinn/hlir/pass/dot_merger.cc +++ b/paddle/cinn/hlir/pass/dot_merger.cc @@ -16,7 +16,7 @@ #include
"paddle/cinn/hlir/framework/graph.h" #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/infershape.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -368,9 +368,12 @@ class DotMergerPass { input_operand(merge_nodes[i - 1], axis)->id()); auto shape_b = builder->shape_dict().at(input_operand(merge_nodes[i], axis)->id()); - CHECK_EQ(shape_a[1 - axis], shape_b[1 - axis]) - << "The shape of matmul is error. " << shape_a.size() << ", " - << shape_b.size(); + PADDLE_ENFORCE_EQ( + shape_a[1 - axis], + shape_b[1 - axis], + phi::errors::InvalidArgument("The shape of matmul is error. %d, %d", + shape_a.size(), + shape_b.size())); concat_nodes.push_back(input_operand(merge_nodes[i], axis)); } auto* concat_out = builder->Concat(axis, concat_nodes); @@ -444,9 +447,12 @@ class DotMergerPass { auto shape_shared = builder->shape_dict().at(shared_input->id()); auto shape_a = builder->shape_dict().at(input_a->id()); auto shape_b = builder->shape_dict().at(input_b->id()); - CHECK_EQ(shape_a[1 - axis], shape_b[1 - axis]) - << "The shape of matmul is error. " << shape_a.size() << ", " - << shape_b.size(); + PADDLE_ENFORCE_EQ( + shape_a[1 - axis], + shape_b[1 - axis], + phi::errors::InvalidArgument("The shape of matmul is error. %d, %d", + shape_a.size(), + shape_b.size())); auto* concat_out = builder->Concat(axis, {input_a, input_b}); NodeData* matmul_out{}; if (!lhs) { diff --git a/paddle/cinn/hlir/pass/fusion_helper_base.h b/paddle/cinn/hlir/pass/fusion_helper_base.h index 3437b334fa5df..79580815d91bf 100644 --- a/paddle/cinn/hlir/pass/fusion_helper_base.h +++ b/paddle/cinn/hlir/pass/fusion_helper_base.h @@ -23,7 +23,7 @@ #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/use_pass.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -104,7 +104,10 @@ class FusionHelperBase { shape_t GetNodeInputShape(const Node* node) const { auto node_datas = GetProducerNodeData(node); - CHECK_GT(node_datas.size(), 0); + PADDLE_ENFORCE_GT( + node_datas.size(), + 0, + phi::errors::InvalidArgument("The input node should not be empty!")); CHECK(shape_dict_.count(node_datas[0]->id())) << "Can't find " << node_datas[0]->id() << " 's shape!"; return shape_dict_.at(node_datas[0]->id()); @@ -168,7 +171,10 @@ class FusionHelperBase { int GetSharedSize(const Node* node) const { auto producers = GetProducerNodeData(node); - CHECK_GT(producers.size(), 0); + PADDLE_ENFORCE_GT( + producers.size(), + 0, + phi::errors::InvalidArgument("The input node should not be empty!")); auto inshape = shape_dict_.at(producers[0]->id()); auto axes = absl::get>(node->attrs.attr_store.at("dim")); if (WithoutLastDimInReduce(inshape, axes)) { diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc index fd023662f9050..0d93dd1593c4f 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/cinn/hlir/pass/fusion_merge_pass_util.h" - +#include "paddle/common/enforce.h" PD_DECLARE_bool(enhance_vertical_fusion_with_recompute); namespace cinn { @@ -705,7 +705,11 @@ class FusionMergePassHelper : public FusionHelperBase { } } - CHECK_GE(producer->consumer_groups().size(), candidates.size()); + PADDLE_ENFORCE_GE(producer->consumer_groups().size(), + candidates.size(), + phi::errors::InvalidArgument( + "The number of candidates should be less than or " + "equal to the number of consumer groups!")); if (producer->consumer_groups().size() == 0 && candidates.size() == 0 && output_nodes_set_.count(producer->CollectNodes()[0]) == 0) { producer->belong_groups.insert(*fusionable_consumers->begin()); @@ -959,8 +963,16 @@ class FusionMergePassHelper : public FusionHelperBase { CHECK(consumer->belong_groups.size()); consumers.insert(*consumer->belong_groups.begin()); } - CHECK_EQ(group->producer_groups().size(), producers.size()); - CHECK_EQ(group->consumer_groups().size(), consumers.size()); + PADDLE_ENFORCE_EQ(group->producer_groups().size(), + producers.size(), + phi::errors::InvalidArgument( + "The number of producers should be equal to the " + "number of producer groups!")); + PADDLE_ENFORCE_EQ(group->consumer_groups().size(), + consumers.size(), + phi::errors::InvalidArgument( + "The number of consumers should be equal to the " + "number of consumer groups!")); (*group->mut_producer_groups()) = producers; (*group->mut_consumer_groups()) = consumers; } diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc old mode 100755 new mode 100644 index f6f9ecee97c43..14cc221edaaf0 --- a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc @@ -15,7 +15,7 @@ #include #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace frontend { @@ -39,9 +39,15 @@ TEST(FusionMergePass, ElementWise_Fusion_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_1) { @@ -65,9 +71,15 @@ TEST(FusionMergePass, ElementWise_Fusion_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_2) { @@ -94,9 +106,15 @@ TEST(FusionMergePass, ElementWise_Fusion_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's 
size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_3) { @@ -123,9 +141,15 @@ TEST(FusionMergePass, ElementWise_Fusion_3) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_4) { @@ -152,9 +176,15 @@ TEST(FusionMergePass, ElementWise_Fusion_4) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_5) { @@ -174,9 +204,15 @@ TEST(FusionMergePass, ElementWise_Fusion_5) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, Broadcast_Test_0) { @@ -199,9 +235,15 @@ TEST(FusionMergePass, Broadcast_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, Broadcast_Test_1) { @@ -224,9 +266,15 @@ TEST(FusionMergePass, Broadcast_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, Broadcast_Test_2) { @@ -249,9 +297,15 @@ TEST(FusionMergePass, 
Broadcast_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Broadcast_Test_3) { @@ -274,9 +328,15 @@ TEST(FusionMergePass, Broadcast_Test_3) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Broadcast_Test_4) { @@ -301,9 +361,15 @@ TEST(FusionMergePass, Broadcast_Test_4) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Broadcast_Test_5) { @@ -328,9 +394,15 @@ TEST(FusionMergePass, Broadcast_Test_5) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); } TEST(FusionMergePass, Reduce_Test_0) { @@ -352,7 +424,10 @@ TEST(FusionMergePass, Reduce_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); // CHECK_EQ(graph->fusion_groups.size(), 2); } @@ -375,9 +450,15 @@ TEST(FusionMergePass, Reduce_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, 
Reduce_Test_2) { @@ -401,9 +482,15 @@ TEST(FusionMergePass, Reduce_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Reduce_Test_3) { @@ -427,7 +514,10 @@ TEST(FusionMergePass, Reduce_Test_3) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); // CHECK_EQ(graph->fusion_groups.size(), 3); } @@ -454,7 +544,10 @@ TEST(FusionMergePass, Reduce_Test_4) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); // CHECK_EQ(graph->fusion_groups.size(), 3); } @@ -478,9 +571,15 @@ TEST(FusionMergePass, Reduce_Test_5) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } } // namespace frontend diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc index b9d553019a459..b27565194f293 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc @@ -25,7 +25,7 @@ #include "paddle/cinn/hlir/pass/general_fusion_merge_pass/lightware_fuse_pass.h" #include "paddle/cinn/hlir/pass/general_fusion_merge_pass/lightware_fuse_pass_ctx.h" #include "paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h" - +#include "paddle/common/enforce.h" PD_DECLARE_bool(enhance_vertical_fusion_with_recompute); namespace cinn { @@ -840,7 +840,11 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { } } - CHECK_GE(producer->consumer_groups().size(), candidates.size()); + PADDLE_ENFORCE_GE( + producer->consumer_groups().size(), + candidates.size(), + phi::errors::Fatal("The number of candidates should be less than or " + "equal to the number of consumers.")); if (producer->consumer_groups().size() == 0 && candidates.size() == 0 && output_nodes_set_.count(producer->CollectNodes()[0]) == 0) { producer->belong_groups.insert(*fusionable_consumers->begin()); @@ -1035,8 +1039,14 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { CHECK(consumer->belong_groups.size()); consumers.insert(*consumer->belong_groups.begin()); } - 
CHECK_EQ(group->producer_groups().size(), producers.size()); - CHECK_EQ(group->consumer_groups().size(), consumers.size()); + PADDLE_ENFORCE_EQ( + group->producer_groups().size(), + producers.size(), + phi::errors::InvalidArgument("Producer size is not equal!")); + PADDLE_ENFORCE_EQ( + group->consumer_groups().size(), + consumers.size(), + phi::errors::InvalidArgument("Consumer size is not equal!")); (*group->mut_producer_groups()) = producers; (*group->mut_consumer_groups()) = consumers; } diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h index 2195d4a4f947b..a8ccbcef27a16 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h @@ -16,7 +16,7 @@ #include "paddle/cinn/api/op_group.h" #include "paddle/cinn/hlir/pass/fusion_merge_pass_util.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -135,7 +135,10 @@ inline bool WithoutLastDimInReduce(const api::Shape& inshape, static int GetSharedSize(const api::OpNode& op_node) { const auto& producers = op_node.inputs(); - CHECK_GT(producers.size(), 0); + PADDLE_ENFORCE_GT(producers.size(), + 0, + phi::errors::InvalidArgument( + "The producer size should be greater than 0.")); const auto& inshape = producers[0].shape(); const auto& axes = op_node.GetAttr>("dim"); if (WithoutLastDimInReduce(inshape, axes)) { diff --git a/paddle/cinn/hlir/pass/infershape.cc b/paddle/cinn/hlir/pass/infershape.cc index 041a63b42b57c..c6a7a6422d8a8 100644 --- a/paddle/cinn/hlir/pass/infershape.cc +++ b/paddle/cinn/hlir/pass/infershape.cc @@ -19,7 +19,7 @@ #include "paddle/cinn/hlir/pass/use_pass.h" #include "paddle/cinn/hlir/pe/schedule.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -76,16 +76,16 @@ void InferShape(Node* node, auto out_dtype = op_inferdtype[node->op()](inputs_dtype, node->attrs.attr_store); - CHECK_GE(node->outlinks_in_order().size(), out_shape.size()) - << "The output number of node " << node->id() << " is " - << node->outlinks_in_order().size() - << " , which is smaller than the output shape size " << out_shape.size() - << " . And the op type is " << node->op()->name; - CHECK_GE(node->outlinks_in_order().size(), out_dtype.size()) - << "The output number of node " << node->id() << " is " - << node->outlinks_in_order().size() - << " , which is smaller than the output dtype size " << out_dtype.size() - << " . 
And the op type is " << node->op()->name; + PADDLE_ENFORCE_GE( + node->outlinks_in_order().size(), + out_shape.size(), + phi::errors::InvalidArgument("The output number of node is smaller " + "than the output shape size")); + PADDLE_ENFORCE_GE( + node->outlinks_in_order().size(), + out_dtype.size(), + phi::errors::InvalidArgument("The output number of node is smaller " + "than the output dtype size")); int counter = 0; for (auto& out_edge : node->outlinks_in_order()) { diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc old mode 100755 new mode 100644 index c9d723c91be50..8c18782cc031d --- a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc +++ b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc @@ -15,7 +15,7 @@ #include #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace frontend { @@ -39,7 +39,10 @@ TEST(OpFusionPass, ElementWise_Fusion_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, ElementWise_Fusion_1) { @@ -63,7 +66,10 @@ TEST(OpFusionPass, ElementWise_Fusion_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Broadcast_Test_0) { @@ -86,7 +92,10 @@ TEST(OpFusionPass, Broadcast_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Broadcast_Test_1) { @@ -111,7 +120,10 @@ TEST(OpFusionPass, Broadcast_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Broadcast_Test_2) { @@ -131,7 +143,10 @@ TEST(OpFusionPass, Broadcast_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Reduce_Test_0) { @@ -155,7 +170,10 @@ TEST(OpFusionPass, Reduce_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument("fusion group size should be 2")); } TEST(OpFusionPass, Reduce_Test_1) { @@ -180,7 +198,10 @@ TEST(OpFusionPass, Reduce_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Reduce_Test_2) { @@ -205,7 +226,10 @@ TEST(OpFusionPass, 
Reduce_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument("fusion group size should be 2")); } TEST(OpFusionPass, Injective_Test_0) { @@ -229,7 +253,10 @@ TEST(OpFusionPass, Injective_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OP_LOWERING, Injective_Test_1) { @@ -247,7 +274,10 @@ TEST(OP_LOWERING, Injective_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Test_Insert_BroadcastTo) { @@ -269,7 +299,10 @@ TEST(OpFusionPass, Test_Insert_BroadcastTo) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } } // namespace frontend diff --git a/paddle/cinn/hlir/pass/opfusion.cc b/paddle/cinn/hlir/pass/opfusion.cc index c8690c0625fbb..84a4071144f96 100644 --- a/paddle/cinn/hlir/pass/opfusion.cc +++ b/paddle/cinn/hlir/pass/opfusion.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/use_pass.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -48,8 +48,14 @@ void GetBroadcastPattern( if (*pattern == framework::kBroadcast) { auto inlinks = op_node->inlinks(); auto outlinks = op_node->outlinks(); - CHECK_EQ(inlinks.size(), 2U); - CHECK_EQ(outlinks.size(), 1U); + PADDLE_ENFORCE_EQ( + inlinks.size(), + 2U, + phi::errors::InvalidArgument("Broadcast op should have 2 inputs")); + PADDLE_ENFORCE_EQ( + outlinks.size(), + 1U, + phi::errors::InvalidArgument("Broadcast op should have 1 output")); std::vector input_shapes; for (auto link : inlinks) { auto source = link->source(); @@ -233,7 +239,11 @@ class GraphPartition { std::vector> Partition( const std::vector& graph_nodes, const std::vector& dom_nodes) { - CHECK_EQ(graph_nodes.size(), dom_nodes.size()); + PADDLE_ENFORCE_EQ( + graph_nodes.size(), + dom_nodes.size(), + phi::errors::InvalidArgument( + "graph_nodes size should be equal to dom_nodes size")); InitGroups(graph_nodes); for (int i = 0; i < 2; i++) { FuseGroups(graph_nodes, dom_nodes, i); @@ -457,8 +467,16 @@ class GraphPartition { void FuseGroups(const std::vector& graph_nodes, const std::vector& dom_nodes, int phase) { - CHECK_EQ(graph_nodes.size(), dom_nodes.size()); - CHECK_EQ(group_nodes_.size(), dom_nodes.size()); + PADDLE_ENFORCE_EQ( + graph_nodes.size(), + dom_nodes.size(), + phi::errors::InvalidArgument( + "graph_nodes size should be equal to dom_nodes size")); + PADDLE_ENFORCE_EQ( + group_nodes_.size(), + dom_nodes.size(), + phi::errors::InvalidArgument( + "group_nodes size should be equal to dom_nodes size")); for (int i = 0; i < graph_nodes.size(); i++) { auto* graph_node = graph_nodes[i]; auto* dom_node = dom_nodes[i]; @@ -521,7 +539,11 @@ class GraphPartition { } 
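The conversion recipe repeated across these files is mechanical; the following minimal sketch reuses the names from the Partition/FuseGroups checks above (illustrative only, not a line from this diff) to show the before/after shape:
// Before: the glog-style macro aborts the process, reporting only the
// stringified expression with no further context.
CHECK_EQ(graph_nodes.size(), dom_nodes.size());
// After: PADDLE_ENFORCE_EQ raises a catchable Paddle exception and carries
// an explicit phi::errors payload stating the violated expectation.
PADDLE_ENFORCE_EQ(
    graph_nodes.size(),
    dom_nodes.size(),
    phi::errors::InvalidArgument(
        "graph_nodes size should be equal to dom_nodes size"));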
void SplitGroups(const std::vector& graph_nodes) { // split groups sorted by topo order - CHECK_EQ(graph_nodes.size(), group_nodes_.size()); + PADDLE_ENFORCE_EQ( + graph_nodes.size(), + group_nodes_.size(), + phi::errors::InvalidArgument( + "graph_nodes size should be equal to group_nodes size")); absl::flat_hash_map> group_maps; std::set root_indice; for (int i = 0; i < graph_nodes.size(); i++) { diff --git a/paddle/cinn/hlir/pass/reduce_split_pass.cc b/paddle/cinn/hlir/pass/reduce_split_pass.cc index 899c233866ca5..cbb6ffa658c47 100644 --- a/paddle/cinn/hlir/pass/reduce_split_pass.cc +++ b/paddle/cinn/hlir/pass/reduce_split_pass.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/infershape.h" #include "paddle/cinn/hlir/pe/nn_util.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -103,7 +103,11 @@ class ReduceSplitPass { auto in_shape = shape_dict.at(in->id()); auto out_shape = shape_dict.at(out->id()); // all preceding reduced - CHECK_GT(in_shape.size(), 1); + PADDLE_ENFORCE_GT( + in_shape.size(), + 1, + phi::errors::InvalidArgument( + "The input shape size should be greater than 1.")); // [NHWC]->[C], only the last dim kept bool all_preceding_dim_reduced = true; for (auto i = 0; i < in_shape.size() - 1; ++i) { @@ -122,7 +126,10 @@ class ReduceSplitPass { in_shape.begin(), in_shape.end(), 1, std::multiplies()); int reduce_numel = std::accumulate( in_shape.begin(), in_shape.end() - 1, 1, std::multiplies()); - CHECK_GT(reduce_numel, 0); + PADDLE_ENFORCE_GT(reduce_numel, + 0, + phi::errors::InvalidArgument( + "The reduce_numel should be greater than 0.")); // if the numel is not large enough, it is no need to split // if loop times is too large with reduce optimize int size = std::accumulate( @@ -132,7 +139,10 @@ class ReduceSplitPass { auto shape = pe::GetFirstStepReduceShape( {size, in_shape.back()}, {0}, bound, tail); CHECK(bound); - CHECK_EQ(shape.size(), 3); + PADDLE_ENFORCE_EQ(shape.size(), + 3, + phi::errors::InvalidArgument( + "The shape size should be equal to 3.")); auto res = DivideToClosetNum(reduce_numel); int reduce_numel0 = std::get<0>(res), reduce_numel1 = std::get<1>(res); diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index 41eb7f2fd2c10..41deddc1507e3 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -360,8 +360,8 @@ ir::Tensor GenerateShape(const std::vector& inputs, const std::vector& output_dim_exprs, const std::string& name) { if (output_dim_exprs.size() != 1) { - LOG(WARNING) << "pe::GenerateShape will return a meaningless tensor when " - "output_dim_exprs.size() != 1"; + VLOG(4) << "pe::GenerateShape will return a meaningless tensor when " + "output_dim_exprs.size() != 1"; return Compute( {Expr(1)}, [=](const std::vector& indice) { return Expr(1); }, diff --git a/paddle/cinn/hlir/pe/schedule_param.proto b/paddle/cinn/hlir/pe/schedule_param.proto index 1d869a570706d..4d2fca1a1b362 100644 --- a/paddle/cinn/hlir/pe/schedule_param.proto +++ b/paddle/cinn/hlir/pe/schedule_param.proto @@ -1,11 +1,11 @@ // Copyright (c) 2021 CINN Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/cinn/ir/group_schedule/base_group_scheduler.h b/paddle/cinn/ir/group_schedule/base_group_scheduler.h index ef77397066351..a96b972d889ea 100644 --- a/paddle/cinn/ir/group_schedule/base_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/base_group_scheduler.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include "paddle/cinn/common/macros.h" #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/group_schedule/config/group_tile_config.h" #include "paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h" @@ -64,6 +65,9 @@ class GroupScheduler { virtual void Schedule() = 0; virtual std::vector> GetIRs() = 0; + virtual std::vector> GetCX86IRs() { + CINN_NOT_IMPLEMENTED; + } std::unordered_set OutputTensorNames() const; diff --git a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt index f6453b645bdc7..256e919fce531 100644 --- a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt +++ b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt @@ -5,7 +5,10 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS group_tile_config.cc) gather_srcs(cinnapi_src SRCS database.cc) -cc_library(file_tile_database SRCS filedatabase.cc) +cc_library( + file_tile_database + SRCS filedatabase.cc + DEPS absl tile_config_proto) foreach(header ${filetileconfig_proto_HDRS}) set(core_proto_includes diff --git a/paddle/cinn/ir/group_schedule/config/database.cc b/paddle/cinn/ir/group_schedule/config/database.cc index a216530126efd..4e3121739b874 100644 --- a/paddle/cinn/ir/group_schedule/config/database.cc +++ b/paddle/cinn/ir/group_schedule/config/database.cc @@ -19,10 +19,16 @@ namespace ir { void NaiveTileConfigDatabase::AddConfig( const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) { + IterSpaceType iter_space_type = [&] { + std::vector> res; + for (const auto& dim : bucket_info.space) { + res.emplace_back(dim.iter_type, (dim.is_dynamic ? 
"dynamic" : "static")); + } + return res; + }(); config_map_[iter_space_type][bucket_info] = config; } diff --git a/paddle/cinn/ir/group_schedule/config/database.h b/paddle/cinn/ir/group_schedule/config/database.h index 9d61f0dd615a5..14367ee492bba 100644 --- a/paddle/cinn/ir/group_schedule/config/database.h +++ b/paddle/cinn/ir/group_schedule/config/database.h @@ -32,7 +32,6 @@ using IterSpaceType = std::vector>; class TileConfigDatabase { public: virtual void AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) = 0; @@ -45,7 +44,6 @@ class TileConfigDatabase { class NaiveTileConfigDatabase final : public TileConfigDatabase { public: void AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority = 1) override; diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.cc b/paddle/cinn/ir/group_schedule/config/filedatabase.cc index 64741521802e9..58b5f13338f0a 100644 --- a/paddle/cinn/ir/group_schedule/config/filedatabase.cc +++ b/paddle/cinn/ir/group_schedule/config/filedatabase.cc @@ -39,22 +39,19 @@ namespace ir { bool TileConfigToProto(group_schedule::config::proto::TileData* tile_data, const TileConfigMap& tile_config_map, - const IterSpaceType& iter_space_type, const int& priority) { for (auto& it : tile_config_map) { - group_schedule::config::proto::Dimension s_dimension, r_dimension; - // prepare key---convert bucket info to proto::bucket_info - s_dimension.set_lower_bound(it.first.sp_lower_bound); - s_dimension.set_upper_bound(it.first.sp_upper_bound); - s_dimension.set_iter_type(iter_space_type[0].first); - s_dimension.set_is_dynamic(iter_space_type[0].second == "dynamic"); - r_dimension.set_lower_bound(it.first.rb_lower_bound); - r_dimension.set_upper_bound(it.first.rb_upper_bound); - r_dimension.set_iter_type(iter_space_type[1].first); - r_dimension.set_is_dynamic(iter_space_type[1].second == "dynamic"); - *(tile_data->mutable_bucket_info()->add_dimension()) = s_dimension; - *(tile_data->mutable_bucket_info()->add_dimension()) = r_dimension; + BucketInfo bucket_info = it.first; + int dims = bucket_info.space.size(); + for (int i = 0; i < dims; i++) { + group_schedule::config::proto::Dimension cur_dimension; + cur_dimension.set_lower_bound(bucket_info.space[i].lower_bound); + cur_dimension.set_upper_bound(bucket_info.space[i].upper_bound); + cur_dimension.set_iter_type(bucket_info.space[i].iter_type); + cur_dimension.set_is_dynamic(bucket_info.space[i].is_dynamic); + *(tile_data->mutable_bucket_info()->add_dimension()) = cur_dimension; + } // prepare value---transfer tile_config to proto::tile_config group_schedule::config::proto::TileConfig tc; @@ -114,18 +111,24 @@ std::string IterSpaceTypeToDir(const common::Target target, } bool FileTileConfigDatabase::Tofile(const common::Target& target, - const IterSpaceType& iter_space_type, int priority) { // Step1. To proto TileConfigMap& tile_config_map = target_config_data_; group_schedule::config::proto::TileData tile_data; - auto is_success = - TileConfigToProto(&tile_data, tile_config_map, iter_space_type, priority); + auto is_success = TileConfigToProto(&tile_data, tile_config_map, priority); if (is_success == false) { PADDLE_THROW(::common::errors::Unavailable( "Can't convert tile_config_map to its proto message.")); } // Step2. 
ToJson + IterSpaceType iter_space_type = [&] { + std::vector> res; + auto bucket_info = tile_config_map.begin()->first; + for (const auto& dim : bucket_info.space) { + res.emplace_back(dim.iter_type, (dim.is_dynamic ? "dynamic" : "static")); + } + return res; + }(); std::string dump_path = IterSpaceTypeToDir(target, iter_space_type); size_t length = tile_config_map.size(); std::vector json_lines(length); @@ -187,7 +190,7 @@ bool comparepriority(group_schedule::config::proto::TileData tile_data1, TileConfigMap FileTileConfigDatabase::GetConfigs( const common::Target& target, const IterSpaceType& iter_space_type) const { - // Step1. ReadFromJsonFile->Message; + // Step 1: Read from json file and convert json to proto message std::string file_path = IterSpaceTypeToDir(target, iter_space_type); auto json_lines = ReadLinesFromFile(file_path); size_t line_length = json_lines.size(); @@ -196,39 +199,41 @@ TileConfigMap FileTileConfigDatabase::GetConfigs( line_length); JsonStringToMessageOfTileConfig(&tile_database, json_lines); - // Step2. ParseFromProtoMessage(); + // Step 2: Parse from proto message TileConfigMap tile_config_map; // order tile_database according to priority std::sort(tile_database.begin(), tile_database.end(), comparepriority); for (const auto& piece_tileconfig : tile_database) { group_schedule::config::proto::BucketInfo its = piece_tileconfig.bucket_info(); - // proto::BucketInfo to bucketinfo - BucketInfo bucket_info; - bucket_info.sp_lower_bound = its.dimension(0).lower_bound(); - bucket_info.sp_upper_bound = its.dimension(0).upper_bound(); - bucket_info.rb_lower_bound = its.dimension(1).lower_bound(); - bucket_info.rb_upper_bound = its.dimension(1).upper_bound(); + // Step 2.1: Convert proto bucketinfo to source bucketinfo + int dims = its.dimension_size(); + BucketInfo bucket_info(static_cast(dims)); + for (int i = 0; i < dims; i++) { + bucket_info.space[i].lower_bound = its.dimension(i).lower_bound(); + bucket_info.space[i].upper_bound = its.dimension(i).upper_bound(); + bucket_info.space[i].iter_type = its.dimension(i).iter_type(); + bucket_info.space[i].is_dynamic = its.dimension(i).is_dynamic(); + } + // Step 2.2: Convert proto tile_config to source tile_config ScheduleConfig::TileConfig tconfig; tconfig.tree_reduce_num = piece_tileconfig.tile_config().tree_reduce_num(); tconfig.spatial_inner_num = piece_tileconfig.tile_config().spatial_inner_num(); tconfig.warp_num = piece_tileconfig.tile_config().warp_num(); tile_config_map[bucket_info] = tconfig; - // Tode[XiaZichao] Add function to cut one lattice into smaller ones. 
+ // TODO(XiaZichao): Add function to cut one lattice into smaller ones } - // ToDo[XiaZichao] update json file using top view of tileconfigMap + // TODO(XiaZichao): update json file using top view of tileconfigMap return tile_config_map; } void FileTileConfigDatabase::AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) { target_config_data_[bucket_info] = config; - auto status = - FileTileConfigDatabase::Tofile(target, iter_space_type, priority); + auto status = FileTileConfigDatabase::Tofile(target, priority); if (status == true) { target_config_data_.clear(); return; diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.h b/paddle/cinn/ir/group_schedule/config/filedatabase.h index 19758dc828c18..3c6b62c676fe8 100644 --- a/paddle/cinn/ir/group_schedule/config/filedatabase.h +++ b/paddle/cinn/ir/group_schedule/config/filedatabase.h @@ -22,7 +22,6 @@ namespace ir { class FileTileConfigDatabase : TileConfigDatabase { public: void AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) override; @@ -31,9 +30,7 @@ class FileTileConfigDatabase : TileConfigDatabase { private: TileConfigMap target_config_data_; - bool Tofile(const common::Target& target, - const IterSpaceType& iter_space_type, - int priority); + bool Tofile(const common::Target& target, int priority); }; } // namespace ir diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc index 40c1d134ac642..42f1a02adf723 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc @@ -20,6 +20,47 @@ namespace ir { const int kMaxNumel = INT32_MAX; +BucketInfo::BucketInfo(int sp_lower_bound, + int sp_upper_bound, + int rb_lower_bound, + int rb_upper_bound, + bool sp_is_dynamic = false, + bool rb_is_dynamic = false) { + BucketInfo::Dimension sp_dimension( + sp_lower_bound, sp_upper_bound, "S", sp_is_dynamic); + BucketInfo::Dimension rb_dimension( + rb_lower_bound, rb_upper_bound, "R", rb_is_dynamic); + this->space.push_back(sp_dimension); + this->space.push_back(rb_dimension); +} + +bool BucketInfo::operator==(const BucketInfo& other) const { + if (this->space.size() != other.space.size()) { + return false; + } + int length = this->space.size(); + for (int i = 0; i < length; i++) { + if (this->space[i].is_dynamic != other.space[i].is_dynamic || + this->space[i].iter_type != other.space[i].iter_type || + this->space[i].lower_bound != other.space[i].lower_bound || + this->space[i].upper_bound != other.space[i].upper_bound) { + return false; + } + } + return true; +} + +std::string BucketInfo::ToString() const { + std::stringstream ss; + ss << "BucketInfo: ["; + for (const auto& dim : space) { + ss << dim.iter_type << "(" << dim.lower_bound << " - " << dim.upper_bound + << "), "; + } + ss << "]"; + return ss.str(); +} + int64_t Next2Power(int64_t n) { if (n == 1) { return 1; @@ -34,8 +75,6 @@ std::shared_ptr InitBasicInfo( base_info->reduce_tensor_names = group_info->reduce_var_names; base_info->shared_var_names = group_info->shared_var_names; base_info->direct_output_var_names = group_info->direct_output_var_names; - base_info->broadcast_info = group_info->broadcast_info; - base_info->broadcast_to_elementwise = group_info->broadcast_to_elementwise; 
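Since the multi-dimensional BucketInfo defined above replaces the old four-bound struct throughout this PR, a minimal usage sketch may help (values are illustrative; the member names are those introduced above):
// The legacy four-bound constructor expands into a dimension vector:
// space[0] is the spatial axis ("S"), space[1] the reduce axis ("R").
BucketInfo bucket{/* sp_lower_bound = */ 1,
                  /* sp_upper_bound = */ 1023,
                  /* rb_lower_bound = */ 1,
                  /* rb_upper_bound = */ 1,
                  /* sp_is_dynamic = */ true,
                  /* rb_is_dynamic = */ false};
// operator== compares every field of every dimension, and ToString()
// renders the space, e.g. "BucketInfo: [S(1 - 1023), R(1 - 1), ]".
VLOG(4) << bucket.ToString();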
base_info->data_rank = group_info->data_space.size(); base_info->raw_data_rank = group_info->raw_data_rank; @@ -190,7 +229,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ 1, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ 8, /* tree_reduce_num = */ 256, @@ -201,7 +242,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info_1_256{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 256}; + /* rb_upper_bound = */ 256, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config_1_256{ /* warp_num = */ 8, /* tree_reduce_num = */ 32, @@ -211,7 +254,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info_257_2048{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 257, - /* rb_upper_bound = */ 2048}; + /* rb_upper_bound = */ 2048, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config_257_2048{ /* warp_num = */ 8, /* tree_reduce_num = */ 128, @@ -221,7 +266,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info_2049_INF{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 2049, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config_2049_INF{ /* warp_num = */ 8, /* tree_reduce_num = */ 256, @@ -242,7 +289,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info__1_1023{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ 1023, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 1}; + /* rb_upper_bound = */ 1, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config__1_1023{ /* warp_num = */ -1, /* tree_reduce_num = */ 1, @@ -251,7 +300,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info__1024_1M{/* sp_lower_bound = */ 1024, /* sp_upper_bound = */ 1024 * 1024 - 1, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 1}; + /* rb_upper_bound = */ 1, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config__1024_1M{ /* warp_num = */ 32, /* tree_reduce_num = */ 1, @@ -260,7 +311,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info__1M_INF{/* sp_lower_bound = */ 1024 * 1024, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 1}; + /* rb_upper_bound = */ 1, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config__1M_INF{ /* warp_num = */ 32, /* tree_reduce_num = */ 1, @@ -273,7 +326,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 2, - /* rb_upper_bound = */ 256}; + /* rb_upper_bound = */ 256, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ 8, /* tree_reduce_num = */ 32, @@ -290,7 +345,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 257, - /* rb_upper_bound = */ 2048}; + /* rb_upper_bound = */ 2048, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ warp_num, /* tree_reduce_num = */ tree_reduce_num, @@ 
-304,7 +361,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 2049, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ warp_num, /* tree_reduce_num = */ tree_reduce_num, @@ -324,7 +383,9 @@ BuildDynamicShapeConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ warp_num, /* tree_reduce_num = */ tree_reduce_num, diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.h b/paddle/cinn/ir/group_schedule/config/group_tile_config.h index a62d9dd84fb59..74be11c5f6e40 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.h +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.h @@ -42,9 +42,6 @@ struct ScheduleConfig { std::set temp_var_names; std::set shared_var_names; std::set direct_output_var_names; - - std::unordered_map broadcast_info; - std::unordered_map broadcast_to_elementwise; }; struct TileConfig { @@ -59,27 +56,70 @@ struct ScheduleConfig { }; struct BucketInfo { - int64_t sp_lower_bound = 1; - int64_t sp_upper_bound = INT64_MAX; - int64_t rb_lower_bound = 1; - int64_t rb_upper_bound = INT64_MAX; - - bool operator==(const BucketInfo& other) const { - return this->sp_lower_bound == other.sp_lower_bound && - this->sp_upper_bound == other.sp_upper_bound && - this->rb_lower_bound == other.rb_lower_bound && - this->rb_upper_bound == other.rb_upper_bound; - } + struct Dimension { + int lower_bound; + int upper_bound; + std::string iter_type; + bool is_dynamic; + std::vector weights; + Dimension() + : lower_bound(0), + upper_bound(INT_MAX), + iter_type("S"), + is_dynamic(false) {} + Dimension(int low, int upper, std::string iter_type, bool is_dynamic) + : lower_bound(low), + upper_bound(upper), + iter_type(iter_type), + is_dynamic(is_dynamic) {} + Dimension(int low, + int upper, + std::string iter_type, + bool is_dynamic, + std::vector weights) + : lower_bound(low), + upper_bound(upper), + iter_type(iter_type), + is_dynamic(is_dynamic), + weights(weights) {} + }; + std::vector space; + + std::string ToString() const; + BucketInfo() = default; + BucketInfo(int sp_lower_bound, + int sp_upper_bound, + int rb_lower_bound, + int rb_upper_bound, + bool sp_is_dynamic, + bool rb_is_dynamic); + explicit BucketInfo(size_t size) : space(std::vector(size)) {} + bool operator==(const BucketInfo& other) const; }; struct BucketInfoHash { std::size_t operator()(const BucketInfo& bucket_info) const noexcept { - std::size_t hash_spl = std::hash{}(bucket_info.sp_lower_bound); - std::size_t hash_spu = std::hash{}(bucket_info.sp_upper_bound); - std::size_t hash_rbl = std::hash{}(bucket_info.rb_lower_bound); - std::size_t hash_rbu = std::hash{}(bucket_info.rb_upper_bound); - return adt::hash_combine(adt::hash_combine(hash_spl, hash_spu), - adt::hash_combine(hash_rbl, hash_rbu)); + PADDLE_ENFORCE_GT( + bucket_info.space.size(), + 0, + ::common::errors::InvalidArgument( + "BucketInfo's dimension number should be greater than 0")); + + std::size_t hash_past_dims = adt::hash_combine( + std::hash{}(bucket_info.space[0].lower_bound), + std::hash{}(bucket_info.space[0].upper_bound)); + int dims = 
bucket_info.space.size(); + if (dims == 1) { + return hash_past_dims; + } else { + for (int i = 1; i < dims; i++) { + std::size_t hash_temp_dim = adt::hash_combine( + std::hash{}(bucket_info.space[i].lower_bound), + std::hash{}(bucket_info.space[i].upper_bound)); + hash_past_dims = adt::hash_combine(hash_past_dims, hash_temp_dim); + } + return hash_past_dims; + } } }; diff --git a/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto b/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto index f8e0aeadcfa09..9396092a422fa 100644 --- a/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto +++ b/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto @@ -1,11 +1,11 @@ // Copyright (c) 2022 CINN Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -36,7 +36,7 @@ message TileConfig{ message TileData{ int32 priority=1; BucketInfo bucket_info =2; - TileConfig tile_config =3; + TileConfig tile_config =3; } message TileDatabase{ diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index 52a08c7a22900..c42ced360d86e 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -66,32 +66,42 @@ void DynamicShapeGroupScheduler::InitBuckets() { << iter_space_info.total_sp_extent; VLOG(4) << "iter_space_info.total_rb_extent: " << iter_space_info.total_rb_extent; - VLOG(4) << "bucket_info.sp_lower_bound: " << bucket_info.sp_lower_bound; - VLOG(4) << "bucket_info.sp_upper_bound: " << bucket_info.sp_upper_bound; - VLOG(4) << "bucket_info.rb_lower_bound: " << bucket_info.rb_lower_bound; - VLOG(4) << "bucket_info.rb_upper_bound: " << bucket_info.rb_upper_bound; - if (OutOfRange(iter_space_info.total_sp_extent, - bucket_info.sp_lower_bound, - bucket_info.sp_upper_bound) || - OutOfRange(iter_space_info.total_rb_extent, - bucket_info.rb_lower_bound, - bucket_info.rb_upper_bound)) { - VLOG(4) << "Out of range"; - return; + VLOG(4) << "bucket_info is: "; + int dims = bucket_info.space.size(); + SymbolicPredicate predicate = ir::Expr(true); + for (int i = 0; i < dims; ++i) { + VLOG(4) << "bucket_info.space[" << i + << "].lower_bound= " << bucket_info.space[i].lower_bound; + VLOG(4) << "bucket_info.space[" << i + << "].upper_bound= " << bucket_info.space[i].upper_bound; + if (dims == 2 && bucket_info.space[1].iter_type == "R") { + if (i == 0 && OutOfRange(iter_space_info.total_sp_extent, + bucket_info.space[i].lower_bound, + bucket_info.space[i].upper_bound)) { + VLOG(4) << "Dimension " << i << " Out of range"; + return; + } + if (i == 1 && OutOfRange(iter_space_info.total_rb_extent, + bucket_info.space[i].lower_bound, + bucket_info.space[i].upper_bound)) { + VLOG(4) << "Dimension " << i << " Out of range"; + return; + } + auto extent = (i == 0) ? 
iter_space_info.total_sp_extent : iter_space_info.total_rb_extent; + SymbolicPredicate lower_bound_predicate = + ir::GE::Make(extent, ir::Expr(bucket_info.space[i].lower_bound)); + SymbolicPredicate upper_bound_predicate = + ir::LE::Make(extent, ir::Expr(bucket_info.space[i].upper_bound)); + SymbolicPredicate curr_predicate = + ir::And::Make(lower_bound_predicate, upper_bound_predicate); + predicate = ir::And::Make(predicate, curr_predicate); + } else { + PADDLE_THROW(::common::errors::Unimplemented( + "Currently, InitBuckets only supports the SR iteration " + "space.")); + } } - SymbolicPredicate sp_lower_bound_predicate = ir::GE::Make( - iter_space_info.total_sp_extent, ir::Expr(bucket_info.sp_lower_bound)); - SymbolicPredicate sp_upper_bound_predicate = ir::LE::Make( - iter_space_info.total_sp_extent, ir::Expr(bucket_info.sp_upper_bound)); - SymbolicPredicate rb_lower_bound_predicate = ir::GE::Make( - iter_space_info.total_rb_extent, ir::Expr(bucket_info.rb_lower_bound)); - SymbolicPredicate rb_upper_bound_predicate = ir::LE::Make( - iter_space_info.total_rb_extent, ir::Expr(bucket_info.rb_upper_bound)); - SymbolicPredicate sp_predicate = - ir::And::Make(sp_lower_bound_predicate, sp_upper_bound_predicate); - SymbolicPredicate rb_predicate = - ir::And::Make(rb_lower_bound_predicate, rb_upper_bound_predicate); - SymbolicPredicate predicate = ir::And::Make(sp_predicate, rb_predicate); ScheduleContext schedule_context{output_names, target_, std::move(iter_space_info), @@ -154,6 +164,14 @@ DynamicShapeGroupScheduler::GetIRs() { return irs; } +std::vector> +DynamicShapeGroupScheduler::GetCX86IRs() { + std::vector> irs(1); + irs[0].first = ir::EQ::Make(ir::Expr(1), ir::Expr(1)); + irs[0].second = ir_sch_->GetModule().GetExprs()[0]; + return irs; +} + IterativeSpaceInfo DynamicShapeGroupScheduler::ConstructIterSpaceInfo( ScheduleBlockNode* node) { VLOG(5) << "global master: " << node->id(); diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h index 0e5205a419973..547d68b5a67a9 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h @@ -37,6 +37,7 @@ class DynamicShapeGroupScheduler : public GroupScheduler { void Schedule() override; std::vector> GetIRs() override; + std::vector> GetCX86IRs() override; struct BucketContext { SymbolicPredicate predicate; diff --git a/paddle/cinn/ir/group_schedule/search/config_searcher.cc b/paddle/cinn/ir/group_schedule/search/config_searcher.cc index 5dffb8a78cd5a..3e620d616762f 100644 --- a/paddle/cinn/ir/group_schedule/search/config_searcher.cc +++ b/paddle/cinn/ir/group_schedule/search/config_searcher.cc @@ -25,18 +25,18 @@ namespace search { WeightedSamplingTrailObjectiveFunc::WeightedSamplingTrailObjectiveFunc( ::pir::Program* program, - const IterSpace& iter_space, + const BucketInfo& bucket_info, double sampling_prob, int max_sampling_times, int repeats) : program_(program), - iter_space_(iter_space), + bucket_info_(bucket_info), measurer_(program), sampling_prob_(sampling_prob), max_sampling_times_(max_sampling_times), repeats_(repeats) { double weighted_space_size = 1.0; - for (const auto& dim : iter_space_.space) { + for (const auto& dim : bucket_info_.space) { PADDLE_ENFORCE_EQ(dim.upper_bound - dim.lower_bound + 1, dim.weights.size(), ::common::errors::InvalidArgument( @@ -54,7 +54,7 @@ WeightedSamplingTrailObjectiveFunc::WeightedSamplingTrailObjectiveFunc( // Generate Sampling Inputs const 
auto Sample = [&]() -> std::vector { std::vector samples; - for (IterSpace::Dimension dim : iter_space_.space) { + for (BucketInfo::Dimension dim : bucket_info_.space) { int sampled = utils::SampleDiscreteFromDistribution(dim.weights, &rand_seed_); samples.push_back(static_cast(sampled) + dim.lower_bound); @@ -82,19 +82,15 @@ ScoreType WeightedSamplingTrailObjectiveFunc::operator()( auto tile_config_database = std::make_shared(); IterSpaceType iter_space_type = [&] { std::vector> res; - for (const auto& dim : iter_space_.space) { + for (const auto& dim : bucket_info_.space) { res.emplace_back(dim.iter_type, (dim.is_dynamic ? "dynamic" : "static")); } return res; }(); - BucketInfo bucket_info{iter_space_.space[0].lower_bound, - iter_space_.space[0].upper_bound, - iter_space_.space[1].lower_bound, - iter_space_.space[1].upper_bound}; ScheduleConfig::TileConfig config{ candidate[0], candidate[1], candidate[2], NoneReduceMethod()}; tile_config_database->AddConfig( - cinn::common::DefaultTarget(), iter_space_type, bucket_info, config); + cinn::common::DefaultTarget(), bucket_info_, config); auto& schedule_config_manager = ScheduleConfigManager::Instance(); schedule_config_manager.AddConfigDatabase("custom", tile_config_database); measurer_.Compile(); diff --git a/paddle/cinn/ir/group_schedule/search/config_searcher.h b/paddle/cinn/ir/group_schedule/search/config_searcher.h index 082417388e8a6..4b97547db6851 100644 --- a/paddle/cinn/ir/group_schedule/search/config_searcher.h +++ b/paddle/cinn/ir/group_schedule/search/config_searcher.h @@ -19,6 +19,7 @@ #include #include +#include "paddle/cinn/ir/group_schedule/config/group_tile_config.h" #include "paddle/cinn/ir/group_schedule/search/measurer.h" #include "paddle/cinn/utils/random_engine.h" #include "paddle/pir/include/core/program.h" @@ -39,7 +40,7 @@ class BaseObjectiveFunc { class WeightedSamplingTrailObjectiveFunc : public BaseObjectiveFunc { public: WeightedSamplingTrailObjectiveFunc(::pir::Program* program, - const IterSpace& iter_space, + const BucketInfo& bucket_info, double sampling_prob = 1.0, int max_sampling_times = 65536, int repeats = 10); @@ -48,7 +49,7 @@ class WeightedSamplingTrailObjectiveFunc : public BaseObjectiveFunc { private: ::pir::Program* program_; - IterSpace iter_space_; + BucketInfo bucket_info_; Measurer measurer_; double sampling_prob_; int max_sampling_times_; diff --git a/paddle/cinn/ir/group_schedule/search/measurer.cc b/paddle/cinn/ir/group_schedule/search/measurer.cc index 1934ebea16b36..ea2fa18dcadbb 100644 --- a/paddle/cinn/ir/group_schedule/search/measurer.cc +++ b/paddle/cinn/ir/group_schedule/search/measurer.cc @@ -35,17 +35,6 @@ namespace cinn { namespace ir { namespace search { -std::string IterSpace::ToString() const { - std::stringstream ss; - ss << "IterSpace: ["; - for (const auto& dim : space) { - ss << dim.iter_type << "(" << dim.lower_bound << " - " << dim.upper_bound - << "), "; - } - ss << "]"; - return ss.str(); -} - std::shared_ptr CreatePassManager() { pir::IrContext* ctx = pir::IrContext::Instance(); ctx->GetOrRegisterDialect(); diff --git a/paddle/cinn/ir/group_schedule/search/measurer.h b/paddle/cinn/ir/group_schedule/search/measurer.h index 76de4b6eb065b..4118c40558b55 100644 --- a/paddle/cinn/ir/group_schedule/search/measurer.h +++ b/paddle/cinn/ir/group_schedule/search/measurer.h @@ -30,19 +30,6 @@ namespace cinn { namespace ir { namespace search { -struct IterSpace { - struct Dimension { - int lower_bound; - int upper_bound; - std::string iter_type; - bool is_dynamic; - std::vector 
weights; - }; - std::vector space; - - std::string ToString() const; -}; - struct MeasureResult { ::common::TimeDuration compile_time; ::common::TimeDuration avg_kernel_execute_time; diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index a807699f330d2..942b522f05f0f 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -47,11 +47,26 @@ bool IsWarpReduce(const ScheduleConfig& config) { return std::visit(MatchWarpReduce, config.tile_config.reduce_method); } +bool UseReduceTile(const ScheduleConfig& config) { + const auto& raw_reduce_axis = config.base_info->raw_reduce_axis; + const auto raw_data_rank = config.base_info->raw_data_rank; + if (raw_reduce_axis.empty()) { + return false; + } + for (size_t i = 1; i < raw_reduce_axis.size(); i++) { + if (raw_reduce_axis[i] != raw_reduce_axis[i - 1] + 1) { + return false; + } + } + return raw_reduce_axis.back() + 1 == raw_data_rank; +} + class TileFirstGeneralTactic final : public ScheduleTactic { public: void Init(ScheduleContext* context) override; void Apply(ir::IRSchedule* sch, const std::string& block_id) override; + void ApplyReduceTile(ir::IRSchedule* sch, const std::string& block_id); std::string TacticName() const override { return "TileFirstGeneralTactic"; } @@ -98,6 +113,11 @@ void TileFirstGeneralTactic::Init(ScheduleContext* context) { void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, const std::string& block_id) { + if (UseReduceTile(context_->config)) { + VLOG(4) << "Using ApplyReduceTile"; + ApplyReduceTile(sch, block_id); + return; + } if (ir::IsReduceInitTensorName(block_id)) return; MergeReduceAxis(sch, block_id); VLOG(6) << "After MergeReduceAxis on block: [" << block_id @@ -136,6 +156,106 @@ void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, SetReduceType(sch, block_id); } +void TileFirstGeneralTactic::ApplyReduceTile(ir::IRSchedule* sch, + const std::string& block_id) { + if (ir::IsReduceInitTensorName(block_id)) return; + + const auto sp_thread = context_->config.tile_config.warp_num * 32 / + context_->config.tile_config.tree_reduce_num; + const auto sp_loop = context_->config.tile_config.spatial_inner_num; + const auto rd_thread = context_->config.tile_config.tree_reduce_num; + VLOG(4) << "ApplyReduceTile sp_thread=" << sp_thread; + VLOG(4) << "ApplyReduceTile sp_loop=" << sp_loop; + VLOG(4) << "ApplyReduceTile rd_thread=" << rd_thread; + VLOG(4) << "ApplyReduceTile vec_flatten_axis: " + << utils::Join(vec_flatten_axis_, ", "); + VLOG(4) << "ApplyReduceTile vec_reduce_axis: " + << utils::Join(vec_reduce_axis_, ", "); + + // Merge reduce axes + MergeReduceAxis(sch, block_id); + VLOG(4) << "After MergeReduceAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Merge spatial axes + MergeFlattenAxis(sch, block_id); + VLOG(4) << "After MergeFlattenAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Split spatial axes -> [sp_block, sp_loop, sp_thread] + int current_reduce_axis = 0; + if (vec_flatten_axis_.size() > 0) { + auto loops = sch->GetLoops(block_id); + if (sp_loop > 1 && sp_thread > 1) { + sch->Split(loops[0], {-1, sp_loop, sp_thread}); + current_reduce_axis = 3; + } else if (sp_loop > 1 || sp_thread > 1) { + sch->Split(loops[0], {-1, sp_loop > 1 ? 
sp_loop : sp_thread}); + current_reduce_axis = 2; + } else { + current_reduce_axis = 1; + } + } + VLOG(4) << "After SplitSpatial on block: [" << block_id << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Split reduce axes -> [rd_loop, rd_thread] + if (vec_reduce_axis_.size() > 0) { + auto loops = sch->GetLoops(block_id); + auto reduce_loop = loops[current_reduce_axis].As(); + sch->Split(loops[current_reduce_axis], {-1, rd_thread}); + VLOG(4) << "Before ReorderReduction on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // TODO(lshpku): the Reorder is unneeded if the later FactorizeReduction + // supports rf_axis=1. + loops = sch->GetLoops(block_id); + sch->Reorder({loops[current_reduce_axis + 1], loops[current_reduce_axis]}); + VLOG(4) << "Before FactorizeReduction on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + if (IsReduceBlock(context_->config, block_id)) { + loops = sch->GetLoops(block_id); + sch->FactorizeReduction(loops[current_reduce_axis], + /* rf_axis = */ 0, + /* with_write_back_block_init = */ false); + } + } + VLOG(4) << "After SplitReduce on block: [" << block_id << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Bind CUDA info + const auto DoBind = [&](const std::vector& loops) { + std::string sp_axis_type = "threadIdx.y"; + std::string rd_axis_type = "threadIdx.x"; + sch->Bind(loops[0], "blockIdx.x"); + if (!vec_flatten_axis_.empty() && sp_thread > 1) { + if (vec_reduce_axis_.empty()) { + sch->Bind(loops[current_reduce_axis - 1], rd_axis_type); + } else { + sch->Bind(loops[current_reduce_axis - 1], sp_axis_type); + } + } + if (!vec_reduce_axis_.empty() && current_reduce_axis > 0) { + sch->Bind(loops[current_reduce_axis], rd_axis_type); + } + }; + DoBind(sch->GetLoops(block_id)); + if (IsReduceBlock(context_->config, block_id) && + sch->HasBlock(block_id + "_rf")) { + DoBind(sch->GetLoops(block_id + "_rf")); + } + VLOG(4) << "After BindCudaInfo on block: [" << block_id << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + VariableTypeAssignment(sch, block_id); + SetReduceType(sch, block_id); +} + void TileFirstGeneralTactic::MergeFlattenAxis(ir::IRSchedule* sch, const std::string& block_id) { if (vec_flatten_axis_.size() >= 2) { diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc index 0aaf620874568..adf979c7a7fd4 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc @@ -52,7 +52,14 @@ void TileTactic::Init(ScheduleContext* context) { int64_t extent = static_cast(total_rb_extent.get_constant()); nums_thread_per_block = GetFirstFactor(extent); } else { - nums_thread_per_block = context_->bucket_info.rb_lower_bound; + if (context->bucket_info.space.size() == 2 && + context->bucket_info.space[1].iter_type == "R") { + nums_thread_per_block = context_->bucket_info.space[1].lower_bound; + } else { + PADDLE_THROW(::common::errors::Unimplemented( + "Currently, GetTreeReduceSize only supports the SR iteration " + "space.")); + } } return nums_thread_per_block > max_num_threads ? max_num_threads : nums_thread_per_block; @@ -95,9 +102,17 @@ void TileTactic::Init(ScheduleContext* context) { // other bound to cuda thread. 
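A hedged sketch of the SR-only sizing rule introduced above (illustrative values; max_num_threads is assumed to come from the target, as elsewhere in this pass):
// For a two-dimensional bucket whose second axis is the reduce axis "R",
// the tree-reduce width is seeded from that axis's lower bound, then
// clamped to the per-block thread limit.
BucketInfo bucket{/* sp_lower_bound = */ 1,
                  /* sp_upper_bound = */ kMaxNumel,
                  /* rb_lower_bound = */ 257,
                  /* rb_upper_bound = */ 2048,
                  /* sp_is_dynamic = */ true,
                  /* rb_is_dynamic = */ false};
int nums_thread_per_block = bucket.space[1].lower_bound;  // 257
int max_num_threads = 1024;  // assumed hardware limit for this sketch
int tree_reduce_size = nums_thread_per_block > max_num_threads
                           ? max_num_threads
                           : nums_thread_per_block;  // 257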
context_->iter_space_info.sp_space.emplace_back( ir::Expr(-1), IterativeSpaceInfo::AxisType::kCudaBlockX); - context_->iter_space_info.sp_space.emplace_back( - ir::Expr(GetNumThreadPerBlock(context_->bucket_info.rb_upper_bound)), - IterativeSpaceInfo::AxisType::kCudaThreadX); + if (context->bucket_info.space.size() == 2 && + context->bucket_info.space[1].iter_type == "R") { + context_->iter_space_info.sp_space.emplace_back( + ir::Expr( + GetNumThreadPerBlock(context_->bucket_info.space[1].upper_bound)), + IterativeSpaceInfo::AxisType::kCudaThreadX); + } else { + PADDLE_THROW(::common::errors::Unimplemented( + "Currently, TileTactic::Init only supports the SR iteration " + "space.")); + } } VLOG(6) << context_->iter_space_info.PrintIterSpace(); } diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 1b9c83913112d..6d658ed30cc27 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -26,6 +26,7 @@ #include "paddle/cinn/ir/module.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/optim/ir_simplify.h" +#include "paddle/common/errors.h" namespace cinn { namespace ir { @@ -255,6 +256,7 @@ Expr For::Make(Var loop_var, Expr body, VectorizeInfo vector_info, BindInfo bind_info) { + ir::TryElevateInt32ToInt64({loop_var, min, extent}); auto node = make_shared(); CHECK(loop_var.defined()); CHECK(min.defined()); @@ -884,9 +886,21 @@ void For::Verify() const { CHECK(extent.defined()); CHECK(body.defined()); - CHECK_EQ(loop_var->type(), type_of()); - CHECK_EQ(min->type(), type_of()); - CHECK_EQ(extent->type(), type_of()); + PADDLE_ENFORCE_EQ((loop_var->type() == type_of()) || + (loop_var->type() == type_of()), + true, + ::common::errors::InvalidArgument( + "loop var's type must be int32 or int64")); + PADDLE_ENFORCE_EQ((min->type() == type_of()) || + (min->type() == type_of()), + true, + ::common::errors::InvalidArgument( + "loop min's type must be int32 or int64")); + PADDLE_ENFORCE_EQ((extent->type() == type_of()) || + (extent->type() == type_of()), + true, + ::common::errors::InvalidArgument( + "loop extent's type must be int32 or int64")); } void PolyFor::Verify() const { diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index eeba03a0978ea..84e14cc839c15 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -402,6 +402,11 @@ struct UnaryOpNode : public ExprNode { return v().type(); } + void replace(Expr old_op, Expr new_op) { + if (v() == old_op) { + v() = new_op; + } + } Expr& v() { return operands().front(); } const Expr& v() const { return operands().front(); } diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc index 24583a67374e7..e68a5396578b0 100644 --- a/paddle/cinn/ir/schedule/impl/base.cc +++ b/paddle/cinn/ir/schedule/impl/base.cc @@ -92,7 +92,7 @@ void DyScheduleImpl::MergeExprs() { } } for (auto& block : merged_block) { - VLOG(3) << "in merged_block, it has " << block; + VLOG(3) << "in merged_block, it has \n" << block; } auto merged_expr = ir::Block::Make(merged_block); exprs[0] diff --git a/paddle/cinn/ir/schedule/schedule_desc.proto b/paddle/cinn/ir/schedule/schedule_desc.proto index 829478cf22dd4..ed6d8bef92dbb 100644 --- a/paddle/cinn/ir/schedule/schedule_desc.proto +++ b/paddle/cinn/ir/schedule/schedule_desc.proto @@ -1,11 +1,11 @@ // Copyright (c) 2022 CINN Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc index 85f8153bb65d4..362e6bff8a113 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc @@ -357,10 +357,10 @@ void EliminateCommonFactorHelper(ir::Expr* expr) { } void EliminateCommonFactorOfLocalIndex(ir::Expr* expr) { - VLOG(2) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; + VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; EliminateCommonFactorHelper(expr); EliminateCommonFactorHelper(expr); - VLOG(2) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; + VLOG(4) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; } } // namespace optim diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc index 2ec4e172b3fc7..0d7ecbbca1b15 100644 --- a/paddle/cinn/optim/resize_buffer.cc +++ b/paddle/cinn/optim/resize_buffer.cc @@ -249,6 +249,7 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> { ir::Store* store = expr->As(); ir::Tensor tensor = store->tensor.as_tensor_ref(); ResizeTensor(&tensor); + ReplaceTensorIndices(store); ir::IRMutator<>::Visit(op, expr); } @@ -277,6 +278,7 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> { for (int i = 0; i < cnt; i++) { load->indices.erase(load->indices.begin()); } + ReplaceTensorIndices(load); ir::IRMutator<>::Visit(op, expr); } @@ -304,6 +306,35 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> { } } + template + void ReplaceTensorIndices(T* op) { + ir::Tensor tensor = op->tensor.as_tensor_ref(); + ir::Buffer buffer = tensor->buffer; + if (!buffer.defined()) return; + if (buffer->memory_type != ir::MemoryType::GPULocal) return; + + VLOG(4) << "replacing index of tensor: " << tensor->name; + ir::Expr index_expr = op->index(); + std::unordered_map var_name_to_expr; + ir::ir_utils::CollectIRNodes(index_expr, [&](const ir::Expr* x) { + const ir::_Var_* var = x->as_var(); + if (var) { + var_name_to_expr[var->name] = var->Copy(); + } + return false; + }); + if (var_name_to_expr.size() != 1) { + return; + } + + ir::Expr single_var = var_name_to_expr.begin()->second; + VLOG(4) << "found single var: " << single_var; + for (size_t i = 0; i + 1 < op->indices.size(); i++) { + op->indices[i] = ir::Expr(0); + } + op->indices.back() = single_var; + } + private: const std::unordered_map>& buffer_name_to_shape_; diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 4e5d5f4c5ae8e..5d4629436d7e6 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -426,7 +426,7 @@ class ReplaceVarToZero : public ir::IRMutator<> { }; void OptimizeExprGPU(Expr *expr) { - VLOG(2) << "Before Optimize Expr:\n" << *expr; + VLOG(4) << "Before Optimize Expr:\n" << *expr; // copy var nodes to prevent one modification leading to multiple changes RestructureVarNodes restructure_var_nodes; @@ -458,7 +458,7 @@ void OptimizeExprGPU(Expr *expr) { ReplaceVarToZero replace_var_to_zero; replace_var_to_zero(expr); - VLOG(2) << 
"After Optimize Expr: \n" << *expr; + VLOG(4) << "After Optimize Expr: \n" << *expr; } } // namespace optim diff --git a/paddle/cinn/pybind/backends.cc b/paddle/cinn/pybind/backends.cc index 4e589380223df..a0f51bc88aad8 100644 --- a/paddle/cinn/pybind/backends.cc +++ b/paddle/cinn/pybind/backends.cc @@ -61,7 +61,10 @@ void BindExecutionEngine(py::module *m) { &ExecutionEngine::Create)), py::arg("options") = ExecutionOptions()) .def("lookup", lookup) - .def("link", &ExecutionEngine::Link); + .def("link", + &ExecutionEngine::Link, + py::arg("module"), + py::arg("add_module") = true); { auto lookup = [](Compiler &self, absl::string_view name) { diff --git a/paddle/fluid/distributed/fleet_executor/task_loop.cc b/paddle/fluid/distributed/fleet_executor/task_loop.cc index 270bce7786038..44e853a0d9684 100644 --- a/paddle/fluid/distributed/fleet_executor/task_loop.cc +++ b/paddle/fluid/distributed/fleet_executor/task_loop.cc @@ -17,8 +17,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" -namespace paddle { -namespace distributed { +namespace paddle::distributed { thread_local TaskLoop* TaskLoop::thread_local_loop_ = nullptr; @@ -81,5 +80,4 @@ void TaskLoop::AbortNotInLoopThread() { std::this_thread::get_id())); } -} // namespace distributed -} // namespace paddle +} // namespace paddle::distributed diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 27a93a9787ff5..e7e708a2ee4f9 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -1,11 +1,11 @@ // Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc index 31c098c49fba2..fa8fa61a23eab 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -15,8 +15,7 @@ #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include -namespace paddle { -namespace distributed { +namespace paddle::distributed { GraphNode::~GraphNode() { if (sampler != nullptr) { @@ -122,5 +121,4 @@ void FeatureNode::recover_from_buffer(char* buffer) { feature.push_back(str); // NOLINT } } -} // namespace distributed -} // namespace paddle +} // namespace paddle::distributed diff --git a/paddle/fluid/distributed/rpc/rpc.proto b/paddle/fluid/distributed/rpc/rpc.proto index 2da9e37ae88d9..d9bd22aa974fc 100644 --- a/paddle/fluid/distributed/rpc/rpc.proto +++ b/paddle/fluid/distributed/rpc/rpc.proto @@ -1,11 +1,11 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 853a0c445797c..247651ae149f5 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -18,6 +18,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/framework/variable_helper.h" @@ -583,6 +584,7 @@ inline void PirRunProgramAPI( //} } + paddle::framework::RunFeedHooks(*forward_program, *global_inner_scope); // interpretercore run if (!forward_program->block()->empty()) { paddle::platform::RecordEvent record_event( @@ -869,7 +871,6 @@ inline void RunProgramGradAPI( auto *backward_global_block = PADDLE_GET_CONST( paddle::framework::BlockDesc *, attrs.at("backward_global_block")); auto *backward_program = backward_global_block->Program(); - details::Trans2ContiguousTensorsInplace(out_grad); auto out_grad_names = details::GetTensorsName(out_grad); @@ -1155,6 +1156,7 @@ inline void PirRunProgramGradAPI( } } + paddle::framework::RunFeedHooks(*backward_program, *global_inner_scope); if (!backward_program->block()->empty()) { paddle::platform::RecordEvent record_event( "interpreter_core_run", diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 041339fe597c3..c8f3dc0d673f1 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -515,6 +515,12 @@ cc_library( feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) + +cc_library( + feed_hook + SRCS feed_hook.cc + DEPS lod_tensor scope glog pir) + cc_library( variable_helper SRCS variable_helper.cc @@ -529,6 +535,7 @@ set(NAIVE_EXECUTOR_DEPS glog lod_rank_table feed_fetch_method + feed_hook graph_to_program_pass standalone_executor variable_helper) @@ -598,6 +605,7 @@ if(WITH_DISTRIBUTE) lodtensor_printer lod_rank_table feed_fetch_method + feed_hook collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass @@ -628,7 +636,7 @@ if(WITH_DISTRIBUTE) # pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry # device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog # index_sampler index_wrapper sampler index_dataset_proto - # lod_rank_table framework_io fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method + # lod_rank_table framework_io fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method feed_hook # graph_to_program_pass variable_helper timer monitor # heter_service_proto fleet heter_server brpc fleet_executor # graph_gpu_wrapper) @@ -677,6 +685,7 @@ if(WITH_DISTRIBUTE) metrics lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer @@ -750,6 +759,7 @@ if(WITH_DISTRIBUTE) metrics lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer @@ 
-808,6 +818,7 @@ elseif(WITH_PSLIB) box_wrapper lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer @@ -854,6 +865,7 @@ else() box_wrapper lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 4c78b12fd4ac4..5e4edb1ca2870 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -24,9 +24,7 @@ COMMON_DECLARE_bool(sync_nccl_allreduce); #endif -namespace paddle { -namespace framework { -namespace details { +namespace paddle::framework::details { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, @@ -335,6 +333,4 @@ void AllReduceOpHandle::SyncNCCLAllReduce() { #endif std::string AllReduceOpHandle::Name() const { return "all_reduce"; } -} // namespace details -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4dbff851f00e2..b8db1e321257b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -21,15 +21,11 @@ #endif #include -namespace paddle { -namespace framework { +namespace paddle::framework { class Variable; -} // namespace framework -} // namespace paddle +} // namespace paddle::framework -namespace paddle { -namespace framework { -namespace details { +namespace paddle::framework::details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, @@ -213,6 +209,4 @@ std::vector EagerDeletionOpHandle::VarsToDelete() const { return var_names; } -} // namespace details -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index fe43126ca8abe..05e1693eb650e 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -26,9 +26,7 @@ PADDLE_DEFINE_EXPORTED_bool( false, "Whether to make the result of computation deterministic in CPU side."); -namespace paddle { -namespace framework { -namespace details { +namespace paddle::framework::details { std::once_flag CollectiveContext::init_flag_; std::unique_ptr CollectiveContext::context_; @@ -318,6 +316,4 @@ std::vector ReduceOpHandle::GetInputValues( } std::string ReduceOpHandle::Name() const { return "reduce"; } -} // namespace details -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/feed_hook.cc b/paddle/fluid/framework/feed_hook.cc new file mode 100644 index 0000000000000..b2322839c6d03 --- /dev/null +++ b/paddle/fluid/framework/feed_hook.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/feed_hook.h" +#include +#include +#include "paddle/common/flags.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/include/core/program.h" + +COMMON_DECLARE_string(logging_pir_py_code_dir); +COMMON_DECLARE_bool(logging_trunc_pir_py_code); + +namespace paddle::framework { + +namespace { + +std::optional GetLoggingFilePath() { + if (FLAGS_logging_pir_py_code_dir.empty()) return std::nullopt; + const std::string file_path = + FLAGS_logging_pir_py_code_dir + "/programs_example_input_tensor_meta.py"; + return file_path; +} + +void TryTruncateLoggingFile() { + if (!FLAGS_logging_trunc_pir_py_code) return; + std::optional file_path = GetLoggingFilePath(); + if (!file_path.has_value()) return; + static std::once_flag once_flag; + std::call_once(once_flag, [&] { + std::ofstream ofs; + ofs.open(file_path.value().c_str(), std::ios::out | std::ios::trunc); + ofs.close(); + }); +} + +template +void VisitFeedName(const pir::Program& program, + const DoEachFeadNameT& DoEachFeadName) { + auto module_op = program.module_op(); + const auto& block = module_op.block(); + const auto& IsDataOp = [](const pir::Operation& op) -> bool { + return op.isa(); + }; + const auto& GetDataOpName = [](const pir::Operation& op) -> std::string { + return op.attributes().at("name").dyn_cast().AsString(); + }; + for (const auto& op : block) { + if (IsDataOp(op)) { + DoEachFeadName(GetDataOpName(op)); + } + } + for (const auto& [name, _] : block.kwargs()) { + DoEachFeadName(name); + } +} + +std::string GetLoggingShapeOrDataForName(int64_t program_id, + const std::string& name, + const phi::DenseTensor& tensor) { + std::ostringstream ss; + ss << "class PirProgram_example_input_tensor_meta_" << program_id << ":"; + ss << "\n\tprogram_id = " << program_id; + ss << "\n\tinput_name = " << std::quoted(name); + ss << "\n\tshape = ["; + int i = 0; + for (int dim : ::common::vectorize(tensor.dims())) { + if (i++ > 0) { + ss << ", "; + } + ss << dim; + } + ss << "]"; + ss << "\n\n"; + return ss.str(); +} + +void AppendToLoggingFile(const std::string& logging_str) { + std::optional file_path = GetLoggingFilePath(); + if (!file_path.has_value()) return; + std::ofstream ofs; + ofs.open(file_path.value().c_str(), std::ios::out | std::ios::app); + if (!ofs.is_open()) return; + ofs << logging_str << std::endl; + ofs.close(); +} + +void AppendLoggingShapeOrDataForName(int64_t uid, + const std::string& name, + const phi::DenseTensor& tensor) { + static std::mutex mutex; + std::unique_lock lock(mutex); + using Name2OnceFlag = std::unordered_map; + static std::unordered_map once_flags; + std::call_once(once_flags[uid][name], [&] { + AppendToLoggingFile(GetLoggingShapeOrDataForName(uid, name, tensor)); + }); +} + +void SaveLoggingShapeOrData(const pir::Program& program, const Scope& scope) { + if (FLAGS_logging_pir_py_code_dir.empty()) return; + TryTruncateLoggingFile(); + VisitFeedName(program, [&](const std::string& name) { + Variable* variable = scope.FindVar(name); + if (variable == nullptr) return; + if 
(!variable->IsType()) return; + const phi::DenseTensor& tensor = variable->Get(); + AppendLoggingShapeOrDataForName(program.id(), name, tensor); + }); +} + +} // namespace + +void RunFeedHooks(const pir::Program& program, const Scope& scope) { + SaveLoggingShapeOrData(program, scope); +} + +} // namespace paddle::framework diff --git a/paddle/fluid/pybind/parallel_executor.h b/paddle/fluid/framework/feed_hook.h similarity index 70% rename from paddle/fluid/pybind/parallel_executor.h rename to paddle/fluid/framework/feed_hook.h index 3c3acace033a7..3a8584e3899b6 100644 --- a/paddle/fluid/pybind/parallel_executor.h +++ b/paddle/fluid/framework/feed_hook.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,12 +14,16 @@ #pragma once -#include "pybind11/pybind11.h" +namespace pir { -namespace paddle { -namespace pybind { +class Program; -void BindParallelExecutor(pybind11::module& m); // NOLINT +} -} // namespace pybind -} // namespace paddle +namespace paddle::framework { + +class Scope; + +void RunFeedHooks(const pir::Program& program, const Scope& scope); + +} // namespace paddle::framework diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b3ff3ac35d96d..a5f1d3bea2e7d 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -334,6 +334,8 @@ if(WITH_XPU) DEPS ${XPU_PASS_DEPS}) pass_library(weight_only_linear_xpu_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(block_multihead_attention_xpu_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) endif() cc_library( diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc index 17f0c642a60d1..c5480db1ca466 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc @@ -16,9 +16,7 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { void AddVarToScope(Scope* param_scope, const std::string& name, @@ -315,8 +313,6 @@ TEST(ApplyCastPass, basic) { cast_num_in_graph)); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir USE_PASS(delete_cast_op_pass); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index 2e5c2b5be4ac3..defc320495064 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -17,10 +17,7 @@ limitations under the License. 
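For reference, the record layout written by the new feed_hook.cc above is easy to reproduce in isolation. Below is a minimal, self-contained sketch of the string-building logic in GetLoggingShapeOrDataForName; MakeExampleRecord and its sample inputs are invented for illustration and are not part of this patch:

#include <cstdint>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors the string layout emitted by GetLoggingShapeOrDataForName in the
// new paddle/fluid/framework/feed_hook.cc; the function name and the inputs
// below are examples, not values taken from a real program.
std::string MakeExampleRecord(int64_t program_id,
                              const std::string& name,
                              const std::vector<int>& dims) {
  std::ostringstream ss;
  ss << "class PirProgram_example_input_tensor_meta_" << program_id << ":";
  ss << "\n\tprogram_id = " << program_id;
  ss << "\n\tinput_name = " << std::quoted(name);
  ss << "\n\tshape = [";
  int i = 0;
  for (int dim : dims) {
    if (i++ > 0) {
      ss << ", ";
    }
    ss << dim;
  }
  ss << "]";
  ss << "\n\n";
  return ss.str();
}

int main() {
  // Prints the Python class literal that the hook would append to
  // programs_example_input_tensor_meta.py for a feed named "x".
  std::cout << MakeExampleRecord(1, "x", {8, 3, 224, 224});
  return 0;
}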
*/ #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" #include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h" -namespace paddle { -namespace framework { -namespace ir { -namespace fusion_group { +namespace paddle::framework::ir::fusion_group { std::string ExtractDataType(const std::vector& nodes) { std::string dtype_str = ""; @@ -373,7 +370,4 @@ std::unordered_map CodeGenerator::EncodeVarNodes( return var_ids; } -} // namespace fusion_group -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir::fusion_group diff --git a/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc index 2e6aaa37808ae..1fbe22ff33021 100644 --- a/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc @@ -39,9 +39,7 @@ GET_IR_NODE(layernorm_40_in_bias); \ GET_IR_NODE(layernorm_40_in_scale); \ GET_IR_NODE(layernorm_40_out); -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { MergeLayernormFusePass::MergeLayernormFusePass() { AddOpCompat(OpCompat("reshape2")) .AddInput("X") @@ -176,9 +174,7 @@ void MergeLayernormFusePass::ApplyImpl(ir::Graph* graph) const { gpd(graph, handler); AddStatis(fusion_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(merge_layernorm_fuse_pass, paddle::framework::ir::MergeLayernormFusePass); REGISTER_PASS_CAPABILITY(merge_layernorm_fuse_pass) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc index b907869b4a38e..e0b96b69116a4 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc @@ -16,9 +16,7 @@ #include "paddle/fluid/framework/ir/pass.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { class Graph; @@ -106,9 +104,7 @@ void AddReaderDependencyPass::ApplyImpl(Graph *graph) const { } } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(add_reader_dependency_pass, paddle::framework::ir::AddReaderDependencyPass); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc index f4f0e393c2499..72e8baaba5017 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc @@ -17,9 +17,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { static std::unordered_set ReaderOpSet() { return {"create_py_reader"}; @@ -78,6 +76,4 @@ void SetReaderOpDeviceInfo(Graph *graph, size_t dev_cnt, size_t dev_idx) { VLOG(10) << "Found op number " << found_op_num; } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir diff --git a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc 
b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc index 1c733636ca7b0..cf17f00fa4080 100644 --- a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc @@ -21,9 +21,7 @@ #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_version_registry.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { void ComputePropagateScalesMkldnnPass::GetTensorFromVector( const std::vector& data_v, phi::DenseTensor* tensor) const { @@ -516,9 +514,7 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { graph, "has_quant_info", "var_quant_scales", var_quant_scales); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(compute_propagate_scales_onednn_pass, paddle::framework::ir::ComputePropagateScalesMkldnnPass); diff --git a/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc index 7733730f7d605..14857f3c550d8 100644 --- a/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc @@ -19,9 +19,7 @@ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { AddOpCompat(OpCompat("conv2d")) @@ -305,9 +303,7 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(graph_with_stats.second); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(conv_elementwise_add_onednn_fuse_pass, paddle::framework::ir::ResidualConnectionMKLDNNFusePass); diff --git a/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc index a21ddd579be3c..f937a1c681b17 100644 --- a/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc @@ -19,9 +19,7 @@ #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { using string::PrettyLogDetail; @@ -132,9 +130,7 @@ void FuseOperatorReshape2OneDNNPass::FuseReshape2(Graph *graph, op_type); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(operator_reshape2_onednn_fuse_pass, paddle::framework::ir::FuseOperatorReshape2OneDNNPass); diff --git a/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc index 4af9c6a770436..7ac8edbb6005c 100644 --- a/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc @@ -17,9 +17,7 @@ #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { using string::PrettyLogDetail; @@ -77,9 +75,7 @@ void 
FuseSqueeze2Transpose2OneDNNPass::ApplyImpl(Graph *graph) const { } } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(squeeze2_transpose2_onednn_fuse_pass, paddle::framework::ir::FuseSqueeze2Transpose2OneDNNPass); diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index ccf2bf22ab57b..718e15b01fd72 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -18,9 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { void PlacementPassBase::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies " << GetPlacementName() << " placement strategy."; @@ -43,6 +41,4 @@ void PlacementPassBase::ApplyImpl(ir::Graph* graph) const { } } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir diff --git a/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc b/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc index 7cbb5c169f63c..3917423754ba4 100644 --- a/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc +++ b/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc @@ -18,18 +18,11 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { class Node; -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir -namespace paddle { -namespace framework { -namespace ir { -namespace patterns { +namespace paddle::framework::ir::patterns { struct PrelnGroupNormAct : public PatternBase { PrelnGroupNormAct(PDPattern *pattern, const std::string &name_scope) @@ -92,7 +85,8 @@ void PrelnGroupNormAct::operator()(PDNode *x, PDNode *y, bool with_act) { } } -} // namespace patterns +} // namespace paddle::framework::ir::patterns +namespace paddle::framework::ir { int PrelnGroupNormActFusePass::ApplyAddGNPattern(ir::Graph *graph, bool with_act) const { @@ -203,9 +197,7 @@ void PrelnGroupNormActFusePass::ApplyImpl(ir::Graph *graph) const { AddStatis(found_subgraph_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(preln_elementwise_groupnorm_act_pass, paddle::framework::ir::PrelnGroupNormActFusePass); diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc index d68694106b5c7..c6a22c143fb66 100644 --- a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc @@ -22,10 +22,7 @@ #endif #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace paddle { -namespace framework { -namespace ir { -namespace patterns { +namespace paddle::framework::ir::patterns { // input_qk input_v // |q |k v @@ -249,7 +246,8 @@ PDNode* TrtQKMultiHeadMatmulPattern::operator()() { return reshape2_qkv_out_var; } -} // namespace patterns +} // namespace paddle::framework::ir::patterns +namespace paddle::framework::ir { int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, const std::string& name_scope, @@ -575,9 +573,7 @@ void 
TrtQkMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(trt_qk_multihead_matmul_fuse_pass, paddle::framework::ir::TrtQkMultiHeadMatmulFusePass); diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc index 0708218dbd07c..e90cadc782a61 100644 --- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -22,18 +22,11 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { class Node; -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir -namespace paddle { -namespace framework { -namespace ir { -namespace patterns { +namespace paddle::framework::ir::patterns { struct TrtSkipLayerNorm : public PatternBase { TrtSkipLayerNorm(PDPattern *pattern, const std::string &name_scope) @@ -102,7 +95,8 @@ PDNode *TrtSkipLayerNorm::operator()(PDNode *x, PDNode *y) { return layer_norm_out_var; } -} // namespace patterns +} // namespace paddle::framework::ir::patterns +namespace paddle::framework::ir { void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( @@ -271,9 +265,7 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { AddStatis(found_subgraph_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(trt_skip_layernorm_fuse_pass, paddle::framework::ir::TrtSkipLayerNormFusePass); diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc index d9907555a17b5..6b49a99c02364 100644 --- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc +++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc @@ -26,9 +26,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/phi/common/data_type.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { namespace { @@ -383,8 +381,6 @@ void TrtSupportNHWCPass::ApplyImpl(Graph *graph) const { AddStatis(transposed_ops.size()); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(trt_support_nhwc_pass, paddle::framework::ir::TrtSupportNHWCPass); diff --git a/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc new file mode 100644 index 0000000000000..3d4c78896f7e2 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
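A recurring, mechanical change throughout this patch is visible in the surrounding files: pre-C++17 nested namespace blocks are collapsed into a single C++17 nested namespace definition with one matching closing comment. A standalone sketch of the two equivalent forms (the demo:: names are invented, not Paddle's; compile with -std=c++17):

// Pre-C++17 form: one block and one closing comment per namespace level.
namespace demo { namespace framework { namespace ir {
inline int answer() { return 42; }
}  // namespace ir
}  // namespace framework
}  // namespace demo

// C++17 nested namespace definition: same namespace, one block, one comment.
namespace demo::framework::ir {
inline int answer_again() { return answer(); }
}  // namespace demo::framework::ir

int main() { return demo::framework::ir::answer_again() == 42 ? 0 : 1; }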
+ +#include + +#include "glog/logging.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/ir/xpu/quant_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class BlockMultiHeadAttentionXPUPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void InplaceBlockMultiHeadAttentionXPU(ir::Graph* graph) const; + + const std::string name_scope_{"block_multihead_attention_xpu_pass"}; +}; + +void BlockMultiHeadAttentionXPUPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + + InplaceBlockMultiHeadAttentionXPU(graph); +} + +void BlockMultiHeadAttentionXPUPass::InplaceBlockMultiHeadAttentionXPU( + ir::Graph* graph) const { + const int64_t max_batch_size = 10; + auto* scope = param_scope(); + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "block_multihead_attention") { + auto* op_desc = node->Op(); + op_desc->SetType("block_multihead_attention_xpu"); + phi::DenseTensor cache_k_per_batch_maxs; + auto base_name = op_desc->Input("qkv")[0]; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + std::string cache_k_per_batch_maxs_name = base_name + "_max_cache_k"; + VarDesc cache_k_per_batch_maxs_desc(cache_k_per_batch_maxs_name); + cache_k_per_batch_maxs_desc.SetPersistable(true); + cache_k_per_batch_maxs_desc.SetShape( + {max_batch_size, static_cast(max_ptr_size)}); + cache_k_per_batch_maxs_desc.SetDataType( + proto::VarType::Type::VarType_Type_FP32); + Node* cache_k_per_batch_maxs_in = + graph->CreateVarNode(&cache_k_per_batch_maxs_desc); + phi::DenseTensor cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + cpu_tensor.set_type(phi::DataType::FLOAT32); + cpu_tensor.Resize({max_batch_size, max_ptr_size}); + std::vector tmp(max_batch_size * max_ptr_size, 0); + memcpy(cpu_ctx->Alloc(&cpu_tensor), + tmp.data(), + max_batch_size * max_ptr_size * sizeof(float)); + Assign(cpu_tensor, + scope->Var(cache_k_per_batch_maxs_name) + ->GetMutable()); + op_desc->SetInput("cache_k_per_batch_maxs", + {cache_k_per_batch_maxs_name}); + + std::string cache_v_per_batch_maxs_name = base_name + "_max_cache_v"; + VarDesc cache_v_per_batch_maxs_desc(cache_v_per_batch_maxs_name); + cache_v_per_batch_maxs_desc.SetPersistable(true); + cache_v_per_batch_maxs_desc.SetShape( + {max_batch_size, static_cast(max_ptr_size)}); + cache_v_per_batch_maxs_desc.SetDataType( + proto::VarType::Type::VarType_Type_FP32); + Node* cache_v_per_batch_maxs_in = + graph->CreateVarNode(&cache_v_per_batch_maxs_desc); + Assign(cpu_tensor, + scope->Var(cache_v_per_batch_maxs_name) + ->GetMutable()); + op_desc->SetInput("cache_v_per_batch_maxs", + {cache_v_per_batch_maxs_name}); + + IR_NODE_LINK_TO(cache_k_per_batch_maxs_in, node); + IR_NODE_LINK_TO(cache_v_per_batch_maxs_in, node); + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
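// In outline, the pass above rewrites each block_multihead_attention op to
// block_multihead_attention_xpu and attaches two new persistable inputs,
// "<qkv_name>_max_cache_k" and "<qkv_name>_max_cache_v": zero-initialized
// FP32 tensors of shape [max_batch_size, max_ptr_size] which, judging from
// their names, hold per-batch max values for the K/V cache on XPU.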
+REGISTER_PASS(block_multihead_attention_xpu_pass, + paddle::framework::ir::BlockMultiHeadAttentionXPUPass); + +REGISTER_PASS_CAPABILITY(block_multihead_attention_xpu_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "block_multihead_attention_xpu", 0)); diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index 609fd78106747..e7a05d75f6e99 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -42,8 +42,7 @@ COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif -namespace paddle { -namespace framework { +namespace paddle::framework { std::vector GetValueIds(pir::Value value, const ValueExecutionInfo& value_exec_info) { @@ -407,5 +406,4 @@ bool GetCondData(const phi::DenseTensor& cond) { return cpu_cond->data()[0]; } -} // namespace framework -} // namespace paddle +} // namespace paddle::framework diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc index 9af41b9e8c08b..b8a56321b9e66 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc @@ -405,9 +405,12 @@ OneDNNPhiKernelInstruction::~OneDNNPhiKernelInstruction() { } void OneDNNPhiKernelInstruction::Run() { + std::vector> tmp_holders; + auto tmp_kernel_context = kernel_context_; + auto tmp_infer_meta_context_ = infer_meta_context_; // Step1. TransLayout - auto inputs = kernel_context_.InputsBetween( - size_t(0), kernel_context_.InputsSize()); + auto inputs = tmp_kernel_context.InputsBetween( + size_t(0), tmp_kernel_context.InputsSize()); for (size_t i = 0; i < inputs.size(); ++i) { auto input = inputs[i]; if (input == nullptr) { @@ -419,10 +422,12 @@ void OneDNNPhiKernelInstruction::Run() { if (skip_format_tensors_.count(i)) { continue; } - VLOG(6) << "input[" << i << "].layout() = " << input->layout(); + VLOG(6) << "input[" << i << "].layout() = " << input->layout() + << ", shape = " << input->dims(); if (input->layout() != phi::DataLayout::ONEDNN) { phi::DataLayout from_layout = input->layout(); - auto transed_tensor = const_cast(input); + tmp_holders.emplace_back(std::make_shared(*input)); + auto transed_tensor = tmp_holders.back().get(); std::set elementwise_kernels = { "add", "subtract", "multiply", "divide"}; @@ -461,8 +466,24 @@ void OneDNNPhiKernelInstruction::Run() { } dnnl::memory::desc out_mem_desc = - phi::funcs::make_memory_desc(*input, from_layout); + phi::funcs::make_memory_desc(*transed_tensor, from_layout); transed_tensor->set_mem_desc(out_mem_desc); + tmp_kernel_context.UpdataInput(i, transed_tensor); + auto meta_tensor = phi::MetaTensor(transed_tensor); + auto input_meta_tensor = phi::MetaTensor(input); + if (tmp_infer_meta_context_.InputsSize() > i && + tmp_infer_meta_context_.InputAt(i).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(i, meta_tensor); + } else { + for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) { + if (tmp_infer_meta_context_.InputAt(j).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(j, meta_tensor); + break; + } + } + } } } @@ -470,7 +491,7 @@ void OneDNNPhiKernelInstruction::Run() { // SetDnnAttrIntoDeviceContext // SetInputsName SetOutputsName 
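// Note: the code below operates on the per-call copies created at the top of
// Run() (tmp_kernel_context / tmp_infer_meta_context_); transposed inputs are
// cloned into tmp_holders and re-attached via UpdataInput, so the cached
// contexts and their original input tensors are no longer mutated in place.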
auto one_dnn_ctx = const_cast( - &kernel_context_.GetDeviceContext()); + &tmp_kernel_context.GetDeviceContext()); for (auto& attr : extra_attr_) { one_dnn_ctx->SetDnnAttr(attr.first, attr.second); } @@ -482,12 +503,12 @@ void OneDNNPhiKernelInstruction::Run() { // Step3. InferMeta if (infer_meta_interface_) { - infer_meta_interface_->infer_meta_(&(infer_meta_context_)); + infer_meta_interface_->infer_meta_(&(tmp_infer_meta_context_)); } // Step4. Run kernel VLOG(6) << "Run op " << phi_op_name_ << " infer meta."; - (*(phi_kernel_))(&(kernel_context_)); + (*(phi_kernel_))(&(tmp_kernel_context)); VLOG(6) << "Run op " << phi_op_name_ << " kernel."; // Step5. ClearDnnAttr diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc index 0115f2f4b9f31..3f72973e37a3e 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc @@ -58,6 +58,7 @@ OneDNNMixedPhiKernelInstruction::OneDNNMixedPhiKernelInstruction( } void OneDNNMixedPhiKernelInstruction::Run() { + std::vector> tmp_holders; // Step1. Mixed Dynamic Choose Kernel if (!has_choose_kernel_) { has_choose_kernel_ = true; @@ -76,9 +77,11 @@ void OneDNNMixedPhiKernelInstruction::Run() { if (use_onednn_kernel_) { OneDNNPhiKernelInstruction::Run(); } else { + auto tmp_kernel_context = kernel_context_; + auto tmp_infer_meta_context_ = infer_meta_context_; // TransLayout first - auto inputs = kernel_context_.InputsBetween( - size_t(0), kernel_context_.InputsSize()); + auto inputs = tmp_kernel_context.InputsBetween( + size_t(0), tmp_kernel_context.InputsSize()); for (size_t i = 0; i < inputs.size(); ++i) { auto input = inputs[i]; @@ -89,30 +92,66 @@ void OneDNNMixedPhiKernelInstruction::Run() { // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in // data_transfer.cc if (!input->IsInitialized() && tmp_layout == DataLayout::NHWC) { - auto transed_tensor = const_cast(input); + tmp_holders.emplace_back(std::make_shared(*input)); + auto transed_tensor = tmp_holders.back().get(); transed_tensor->set_layout(tmp_layout); phi::funcs::MatchShapeToLayout( transed_tensor, phi::DataLayout::ONEDNN, tmp_layout); + dnnl::memory::desc out_mem_desc = + phi::funcs::make_memory_desc(*transed_tensor, tmp_layout); + transed_tensor->set_mem_desc(out_mem_desc); + tmp_kernel_context.UpdataInput(i, transed_tensor); + auto meta_tensor = phi::MetaTensor(transed_tensor); + auto input_meta_tensor = phi::MetaTensor(input); + if (tmp_infer_meta_context_.InputsSize() > i && + tmp_infer_meta_context_.InputAt(i).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(i, meta_tensor); + } else { + for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) { + if (tmp_infer_meta_context_.InputAt(j).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(j, meta_tensor); + break; + } + } + } } else { - phi::DenseTensor transed_tensor; - transed_tensor.set_meta(input->meta()); + tmp_holders.emplace_back(std::make_shared()); + auto transed_tensor = tmp_holders.back().get(); + transed_tensor->set_meta(input->meta()); phi::funcs::TransDataLayoutFromOneDNN(phi::DataLayout::ONEDNN, tmp_layout, *input, - &transed_tensor, + transed_tensor, phi::CPUPlace()); - *(const_cast(input)) = transed_tensor; + tmp_kernel_context.UpdataInput(i, transed_tensor); + auto meta_tensor = 
phi::MetaTensor(transed_tensor); + auto input_meta_tensor = phi::MetaTensor(input); + if (tmp_infer_meta_context_.InputsSize() > i && + tmp_infer_meta_context_.InputAt(i).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(i, meta_tensor); + } else { + for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) { + if (tmp_infer_meta_context_.InputAt(j).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(j, meta_tensor); + break; + } + } + } } } } VLOG(6) << "Begin run op " << phi_op_name_ << " infer meta."; if (infer_meta_interface_) { - infer_meta_interface_->infer_meta_(&(infer_meta_context_)); + infer_meta_interface_->infer_meta_(&(tmp_infer_meta_context_)); } VLOG(6) << "End run op " << phi_op_name_ << " infer meta."; VLOG(6) << "Begin run op " << phi_op_name_ << " kernel."; - (*(phi_kernel_))(&(kernel_context_)); + (*(phi_kernel_))(&(tmp_kernel_context)); VLOG(6) << "End run op " << phi_op_name_ << " kernel."; } } diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 54ee746726e7e..96f21e1a534c0 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -431,12 +431,24 @@ void analyse_event_info_for_two_instructions( if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || - !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || instructions[next_instr_id]->OpBase()->Type() == "depend") { waiter_instr_ids->insert(next_instr_id); return; } + if (!run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty()) { + auto& next_next_instructor_ids = + run_type_info[next_instr_id][DownstreamRunType::kEventRun]; + for (auto& id : next_next_instructor_ids) { + if (has_data_dependency( + instructions[cur_instr_id], instructions[id])) { + waiter_instr_ids->insert(next_instr_id); + return; + } + } + return; + } + // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and // simultaneously next_instr has no event_run downstream instr, we try to // recursively add events between cur_instr and next_instr's @@ -491,12 +503,25 @@ void analyse_event_info_for_two_instructions< if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || - !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || instructions[next_instr_id]->Name() == "pd_op.depend") { waiter_instr_ids->insert(next_instr_id); return; } + if (!run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty()) { + auto& next_next_instructor_ids = + run_type_info[next_instr_id][DownstreamRunType::kEventRun]; + for (auto& id : next_next_instructor_ids) { + if (has_data_dependency( + instructions[cur_instr_id], instructions[id])) { + waiter_instr_ids->insert(next_instr_id); + return; + } + } + + return; + } + // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and // simultaneously next_instr has no event_run downstream instr, we try to // recursively add events between cur_instr and next_instr's diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 416d46c01e1f2..d5fe408d53401 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -35,8 +35,7 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, "Use local_scope 
in new executor(especially used " "in UT), can turn off for better performance"); -namespace paddle { -namespace framework { +namespace paddle::framework { InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, @@ -170,5 +169,4 @@ Variable* InterpreterCore::DebugVar(const std::string& name) const { return impl_->DebugVar(name); } -} // namespace framework -} // namespace paddle +} // namespace paddle::framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 4e4b41579f4fe..3374d38ccaae6 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -13,14 +13,14 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/common/flags.h" +#include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/new_executor/feed_fetch_utils.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/pir_interpreter.h" #include "paddle/fluid/framework/new_executor/program_interpreter.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/pir/transforms/general/inplace_pass.h" @@ -66,6 +66,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, std::shared_ptr<::pir::Program> ir_program = nullptr; if (FLAGS_enable_pir_api || FLAGS_enable_pir_in_executor) { // NOLINT ir_program = plan_.IrProgram(job_type); + RunFeedHooks(*ir_program, *scope); } else { // NOTE (liuchenghao): std::make_shared will duplicate ProgramDesc object, // maybe std::make_unique is better? diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 9f4f46c60cea4..2a39e664276ed 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -39,8 +39,7 @@ COMMON_DECLARE_bool(check_nan_inf); PD_DECLARE_bool(benchmark); COMMON_DECLARE_bool(run_kp_kernel); -namespace paddle { -namespace imperative { +namespace paddle::imperative { static const phi::Kernel empty_kernel; static const framework::RuntimeContext empty_ctx({}, {}); @@ -752,5 +751,4 @@ void PreparedOp::Run(const NameVarMap& ins, } } -} // namespace imperative -} // namespace paddle +} // namespace paddle::imperative diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index aaf9439d2b9ed..e8b8c27a24e58 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -20,9 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/scope.h" -namespace paddle { -namespace inference { -namespace analysis { +namespace paddle::inference::analysis { void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { std::string model_opt_cache_dir = argument->optimized_model_save_path(); @@ -137,6 +135,4 @@ std::string SaveOptimizedModelPass::repr() const { return "save_optimized_model_pass"; } -} // namespace analysis -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index adb7021633b8e..7a211edc2a699 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -28,6 +28,7 @@ #include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -1444,7 +1445,9 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to set feed"; return false; } - + if (config_.new_ir_enabled()) { + ::paddle::framework::RunFeedHooks(*pir_program_, *scope); + } #ifdef PADDLE_WITH_TENSORRT if (config_.tensorrt_engine_enabled()) { inference::tensorrt::TensorRTEngine::predictor_id_per_thread = @@ -1519,7 +1522,9 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to set feed"; return false; } - + if (config_.new_ir_enabled()) { + ::paddle::framework::RunFeedHooks(*pir_program_, *scope); + } #ifdef PADDLE_WITH_TENSORRT if (config_.tensorrt_engine_enabled()) { inference::tensorrt::TensorRTEngine::predictor_id_per_thread = diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat index 6eb932a190654..4bb859becf70c 100644 --- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat +++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat @@ -65,12 +65,12 @@ if /i "%use_gpu%"=="Y" ( set use_gpu=N ) -rem set_path_vs_command_prompt +rem set_path_vs_command_prompt :set_vcvarsall_dir SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat =======>" set tmp_var=!vcvarsall_dir! call:remove_space -set vcvarsall_dir=!tmp_var! +set vcvarsall_dir=!tmp_var! IF NOT EXIST "%vcvarsall_dir%" ( echo "------------%vcvarsall_dir% not exist------------" goto set_vcvarsall_dir @@ -104,18 +104,18 @@ if EXIST "%source_path%\%model_name%.tar.gz" ( SET /P python_path="Please input the path of python.exe, such as C:\Python37\python.exe =======>" set tmp_var=!python_path! call:remove_space - set python_path=!tmp_var! + set python_path=!tmp_var! if "!python_path!"=="" ( set python_path=python.exe ) else ( if NOT exist "!python_path!" ( - echo "------------!python_path! not exist------------" + echo "------------!python_path! not exist------------" goto:eof - ) + ) ) md %source_path%\%model_name% !python_path! 
%source_path%\untar_model.py %source_path%\%model_name%.tar.gz %source_path%\%model_name% - + SET error_code=N if "%model_name%"=="mobilenet" ( if NOT EXIST "%source_path%\%model_name%\model" set error_code=Y @@ -127,7 +127,7 @@ if EXIST "%source_path%\%model_name%.tar.gz" ( del /f /s /q "%source_path%\%model_name%\*.*" >nul 2>&1 rd /s /q "%source_path%\%model_name%" >nul 2>&1 goto:eof - ) + ) ) ) @@ -201,7 +201,7 @@ if /i "%use_gpu%"=="Y" ( ) if exist "%build_path%\Release\%demo_name%.exe" ( - cd %build_path%\Release + cd %build_path%\Release set GLOG_v=4 if "%demo_name%"=="simple_on_word2vec" ( %demo_name%.exe --dirname="%source_path%\%model_name%\%model_name%" --use_gpu="%use_gpu%" diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 57f8066df1eeb..d8206093efa53 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -60,11 +60,6 @@ void Tensor::Reshape(const std::vector &shape) { "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); tensor->Resize(common::make_ddim(shape)); -#ifdef PADDLE_WITH_DNNL - if (tensor->layout() == phi::DataLayout::ONEDNN) { - tensor->set_layout(phi::DataLayout::ANY); - } -#endif } void Tensor::ReshapeStrings(const size_t &shape) { @@ -212,11 +207,6 @@ void Tensor::CopyFromCpu(const T *data) { if (place_ == PlaceType::kCPU) { auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); -#ifdef PADDLE_WITH_DNNL - if (tensor->layout() == phi::DataLayout::ONEDNN) { - tensor->set_layout(phi::DataLayout::ANY); - } -#endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index a296074f9d6cf..45c2d5607afde 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -538,6 +538,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "group_norm_silu_xpu_fuse_pass", "embedding_with_eltwise_add_xpu_fuse_pass", "qk_qkv_attention_xpu_fuse_pass", + "block_multihead_attention_xpu_pass", "multi_encoder_xpu_fuse_pass", "multi_encoder_xpu_adaptive_seqlen_fuse_pass", "multi_encoder_xpu_slice_fuse_pass", @@ -613,11 +614,13 @@ const std::vector kPirGpuPasses{ "fused_weight_only_linear_pass", "matmul_add_act_fuse_pass", "fc_elementwise_layernorm_fuse_pass", + "add_norm_fuse_pass", "matmul_scale_fuse_pass", "matmul_transpose_fuse_pass", "transpose_flatten_concat_fuse_pass", "remove_redundant_transpose_pass", - "transfer_layout_pass"}; + "transfer_layout_pass", +}; const std::vector kPirXpuPasses{// Functional pass "map_op_to_another_pass", diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 267dcf7fb601d..180d4e643ba23 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -71,7 +71,7 @@ /* *paddle::framework*; */ *paddle::framework::InitDevices*; *paddle::framework::InitMemoryMethod*; - + *paddle::framework::InterpreterCore*; *paddle::framework::Executor*; *paddle::framework::proto*; diff --git a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc index 37a53d31f47b5..547ec74c19fa6 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc 
+++ b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { template void ConvertConv3d(TensorRTEngine* engine, @@ -192,9 +190,7 @@ class Deconv3dOpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(conv3d, Conv3dOpConverter); REGISTER_TRT_OP_CONVERTER(conv3d_transpose, Deconv3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index d3fda4cb24e28..f505c36b2ed5c 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { // LeakyRelu converter from fluid to tensorRT class LeakyReluOpConverter : public OpConverter { @@ -121,8 +119,6 @@ class LeakyReluOpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc index 16d6f3f20750c..fd72f8b78f9af 100644 --- a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc @@ -12,9 +12,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/phi/common/data_type.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { /* * After trt_map_ops_to_matrix_multiply_pass(mul, matmul, matmul_v2 -> @@ -266,8 +264,6 @@ class MatrixMultiplyOpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(matrix_multiply, MatrixMultiplyOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc index 107217477d14f..f2d00ab4b4667 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc @@ -13,9 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { class MultiClassNMS3OpConverter : public OpConverter { public: @@ -170,8 +168,6 @@ class MultiClassNMS3OpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(multiclass_nms3, MultiClassNMS3OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc index 1dca9bb818c38..f7fda67a3643f 100644 --- a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc @@ -15,9 +15,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { /* * Convert Transformer Input(pos_id, max_seqlen). @@ -58,8 +56,6 @@ class TransformerInputConvert : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(transformer_input_convert, TransformerInputConvert); diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index d87c9af8cfa67..ae12901e7da90 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -29,9 +29,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/float16.h" using float16 = phi::dtype::float16; -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { public: @@ -1049,6 +1047,4 @@ TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) { } */ #endif -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt diff --git a/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 b/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 index e7b7812fe61be..71c38e487c909 100644 --- a/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 +++ b/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 @@ -2,7 +2,7 @@ namespace paddle { namespace translator { - + OpNameNormalizer::OpNameNormalizer() { op_name_mappings = { {% for legacy_name, normalized_name in op_name_pairs.items() %} @@ -11,35 +11,35 @@ OpNameNormalizer::OpNameNormalizer() { }; op_arg_name_mappings = { {% for op_name, arg_name_mappings in op_arg_name_pairs.items() %} - { - "{{op_name}}", + { + "{{op_name}}", { {% for normalized_name, legacy_name in arg_name_mappings.items() %} { "{{normalized_name}}", "{{legacy_name}}" }, {% endfor %} - }, + }, }, {% endfor %} }; op_mutable_attributes = { {% for op_name, mutable_attributes in op_mutable_attributes.items() %} - { - "{{op_name}}", + { + "{{op_name}}", { {% for attribute_name in mutable_attributes %} "{{attribute_name}}", {% endfor %} - }, + }, }, {% endfor %} }; op_mutable_attribute_infos = { {% for op_name, mutable_attribute_infos in op_mutable_attribute_infos.items() %} - { - "{{op_name}}", + { + "{{op_name}}", { {% for attribute_name, attribute_info in mutable_attribute_infos.items() %} - { + { "{{attribute_name}}", { {% for candidate_var_name in attribute_info %} @@ -48,7 +48,7 @@ OpNameNormalizer::OpNameNormalizer() { }, }, {% endfor %} - }, + }, }, {% endfor %} }; diff --git a/paddle/fluid/jit/property.proto b/paddle/fluid/jit/property.proto index 5f89e1da90b91..a00da9fc6e40a 100644 --- a/paddle/fluid/jit/property.proto +++ b/paddle/fluid/jit/property.proto @@ -84,7 +84,7 @@ message TensorProto { // For int64. 
// When this field is present, the data_type field MUST be INT64 repeated int64 int64_data = 7 [packed = true]; - + // For double // Complex128 tensors are encoded as a single array of doubles, // with the real components appearing in odd numbered positions, @@ -130,16 +130,16 @@ message ValueProto { STRINGS = 8; TENSORS = 9; } - optional string name = 1; - + optional string name = 1; + optional AttributeType type = 2; // discriminator that indicates which field below is in use - + // Exactly ONE of the following fields must be present optional float f = 3; // float optional int64 i = 4; // int optional bytes s = 5; // UTF-8 string optional TensorProto t = 6; // tensor value - + repeated float floats = 7; // list of floats repeated int64 ints = 8; // list of ints repeated bytes strings = 9; // list of UTF-8 strings @@ -147,5 +147,5 @@ message ValueProto { } message PropertyVals { - repeated ValueProto entrys=1; + repeated ValueProto entrys=1; } diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 398c015627860..426eeeae70e55 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -19,9 +19,7 @@ #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace memory { -namespace allocation { +namespace paddle::memory::allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } @@ -52,6 +50,4 @@ phi::Allocation *CPUAllocator::AllocateImpl(size_t size) { HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return new Allocation(p, size, platform::CPUPlace()); } -} // namespace allocation -} // namespace memory -} // namespace paddle +} // namespace paddle::memory::allocation diff --git a/paddle/fluid/memory/allocation/memory_block_desc.cc b/paddle/fluid/memory/allocation/memory_block_desc.cc index d20d56a6d05e8..1d1f3c2396921 100644 --- a/paddle/fluid/memory/allocation/memory_block_desc.cc +++ b/paddle/fluid/memory/allocation/memory_block_desc.cc @@ -17,9 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/memory_block.h" -namespace paddle { -namespace memory { -namespace detail { +namespace paddle::memory::detail { MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, @@ -74,6 +72,4 @@ bool MemoryBlock::Desc::CheckGuards() const { return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2); } -} // namespace detail -} // namespace memory -} // namespace paddle +} // namespace paddle::memory::detail diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4714f3a2eb446..fc28e02b7bdb9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -82,7 +82,7 @@ endif() set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi common phi_utils static_prim_api get_expected_kernel_func) -register_operators(EXCLUDES py_func_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op +register_operators(EXCLUDES py_func_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op run_program_op quantize_linear_op save_combine_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} processgroup_comm_utils) op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS}) @@ -108,8 +108,6 @@ if (WITH_GPU OR WITH_ROCM) op_library(sync_batch_norm_op DEPS processgroup_comm_utils) endif() -op_library(lstm_op DEPS ${OP_HEADER_DEPS}) - set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) if (WITH_DGC) diff --git a/paddle/fluid/operators/assign_pos_op.cc b/paddle/fluid/operators/assign_pos_op.cc deleted file mode 100644 index 7def3a0cac503..0000000000000 --- a/paddle/fluid/operators/assign_pos_op.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class AssignPosOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("cum_count"), "Input", "cum_count", "AssignPos"); - OP_INOUT_CHECK( - ctx->HasInput("eff_num_len"), "Input", "eff_num_len", "AssignPos"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AssignPos"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AssignPos"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto cum_count_dtype = - OperatorWithKernel::IndicateVarDataType(ctx, "cum_count"); - auto X_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - PADDLE_ENFORCE_EQ(cum_count_dtype, - X_dtype, - phi::errors::InvalidArgument( - "The dtype of the cum_count and X should be same")); - PADDLE_ENFORCE_EQ(cum_count_dtype, - framework::proto::VarType::INT64, - phi::errors::InvalidArgument( - "The dtype of the cum_count_dtype, eff_num_len and " - "X should be same as int64")); - return phi::KernelKey(cum_count_dtype, ctx.device_context().GetPlace()); - } -}; - -class AssignPosOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "numbers to scatter."); - AddInput("cum_count", "The cumulative sum count of numbers."); - AddInput("eff_num_len", - "The effective numbers of numbers should be scattered."); - AddOutput("Out", "Assemble numbers in the order of counters."); - - AddComment(R"DOC( -assign_pos_op Operator. - -Assign pos decides which tokens should be fetched belong to -specially counter orderingly. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(assign_pos, - ops::AssignPosOp, - ops::AssignPosOpMaker); diff --git a/paddle/fluid/operators/channel_shuffle_op.cc b/paddle/fluid/operators/channel_shuffle_op.cc deleted file mode 100644 index 69f75691a0318..0000000000000 --- a/paddle/fluid/operators/channel_shuffle_op.cc +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ChannelShuffleOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class ChannelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), " - "the input feature data of ChannelShuffleOp, the layout is " - "[N, C, H, W] or [N, H, W, C]."); - AddOutput("Out", - "(Tensor, default Tensor), the output of " - "ChannelShuffleOp. The layout is also [N, C, " - "H, W] or [N, H, W, C]."); - AddAttr("groups", "number of groups to divide channels in."); - AddAttr( - "data_format", - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\", Specify the data format of the input data.") - .SetDefault("NCHW"); - - AddComment(R"DOC( - Channel Shuffle operator - This operator divides channels in a tensor of shape :math:`(*, C, H, W)` - into :math:`g` groups and rearranges them as :math:`(*, C/g, g, H, W)` - while keeping the original tensor shape. - - Please refer to the paper: - `ShuffleNet: An Extremely Efficient Convolutional Neural Network for - Mobile Devices `_ - by Zhang et. al (2017) for more details. - - )DOC"); - } -}; - -class ChannelShuffleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class ChannelShuffleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("channel_shuffle_grad"); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(channel_shuffle, - ChannelShuffleInferShapeFunctor, - PD_INFER_META(phi::ChannelShuffleInferMeta)); - -REGISTER_OPERATOR(channel_shuffle, - ops::ChannelShuffleOp, - ops::ChannelShuffleOpMaker, - ops::ChannelShuffleGradOpMaker, - ops::ChannelShuffleGradOpMaker, - ChannelShuffleInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(channel_shuffle_grad, - ChannelShuffleGradInferShapeFunctor, - PD_INFER_META(phi::ChannelShuffleGradInferMeta)); - -REGISTER_OPERATOR(channel_shuffle_grad, - ops::ChannelShuffleGradOp, - ChannelShuffleGradInferShapeFunctor); diff --git a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc index 963ea26321bdb..13d07557f1e7c 100644 --- a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc @@ -14,17 +14,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace framework { +namespace paddle::framework { class OpDesc; -} // namespace framework -namespace imperative { +} // namespace paddle::framework +namespace paddle::imperative { class OpBase; -} // namespace imperative -} // namespace paddle +} // namespace paddle::imperative -namespace paddle { -namespace operators { +namespace paddle::operators { class CAllReduceAvgOpMaker : public CAllReduceOpMaker { protected: @@ -33,8 +30,7 @@ class CAllReduceAvgOpMaker : public CAllReduceOpMaker { DECLARE_INPLACE_OP_INFERER(AllreduceAvgInplaceInferer, {"X", "Out"}); -} // namespace operators -} // namespace paddle +} // namespace paddle::operators namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc index cf2a0ece1a7ab..961b8c4cf1382 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cc @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/partial_send_op.h" -namespace paddle { -namespace operators { +namespace paddle::operators { class PartialSendOp : public framework::OperatorWithKernel { public: @@ -84,8 +83,7 @@ Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.h } }; -} // namespace operators -} // namespace paddle +} // namespace paddle::operators namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt index b18e026499243..1b602fe43aab1 100644 --- a/paddle/fluid/operators/compat/conv2d.pbtxt +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -50,7 +50,7 @@ extra { attrs { name: "quantization_type" type: STRING - } + } attrs { name: "bit_length" type: INT diff --git a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt index c805547e0143d..ed04ecc4b71ec 100644 --- a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt +++ b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt @@ -8,7 +8,7 @@ def { } inputs { name: "Bias" - } + } outputs { name: "Output" } diff --git a/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt index bce4fc9f0e114..93bf29b8b394a 100644 --- a/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt +++ b/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt @@ -8,7 +8,7 @@ def { } inputs { name: "Bias" - } + } outputs { name: "Output" } diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt index ee04cd73dd70c..a0d80211c2594 100644 --- a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt +++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt @@ -42,7 +42,7 @@ extra { attrs { name: "quantization_type" type: STRING - } + } attrs { name: "bit_length" type: INT diff --git a/paddle/fluid/operators/compat/fused_transpose.pbtxt b/paddle/fluid/operators/compat/fused_transpose.pbtxt index e4c7c218cc117..677d2e5792f75 100644 --- a/paddle/fluid/operators/compat/fused_transpose.pbtxt +++ b/paddle/fluid/operators/compat/fused_transpose.pbtxt @@ -17,26 +17,26 @@ def { extra { attrs{ name: "fused_squeeze2_axes" - type: INTS + type: INTS } attrs{ name: "fused_unsqueeze2_axes" - type: INTS + type: INTS } attrs{ name: "fused_reshape2_shape" - type: INTS + type: INTS } attrs{ name: 
"scale" - type: FLOAT + type: FLOAT } attrs{ name: "shift" - type: FLOAT + type: FLOAT } attrs{ name: "output_data_type" - type: STRING + type: STRING } } diff --git a/paddle/fluid/operators/compat/mul.pbtxt b/paddle/fluid/operators/compat/mul.pbtxt index 056f799c6c49c..28b40d0e6526c 100644 --- a/paddle/fluid/operators/compat/mul.pbtxt +++ b/paddle/fluid/operators/compat/mul.pbtxt @@ -22,7 +22,7 @@ extra { attrs { name: "Out0_threshold" type: FLOAT - } + } attrs { name: "bit_length" type: INT @@ -30,7 +30,7 @@ extra { attrs { name: "quantization_type" type: STRING - } + } attrs { name: "skip_quant" type: BOOLEAN diff --git a/paddle/fluid/operators/compat/sequence_conv.pbtxt b/paddle/fluid/operators/compat/sequence_conv.pbtxt index c5335a25c557a..679b1095a57ba 100644 --- a/paddle/fluid/operators/compat/sequence_conv.pbtxt +++ b/paddle/fluid/operators/compat/sequence_conv.pbtxt @@ -23,7 +23,7 @@ def { attrs { name: "contextStride" type: INT - } + } } extra { attrs { @@ -49,5 +49,5 @@ extra { attrs { name: "op_device" type: STRING - } + } } diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc index de04cb0e3bba5..ffdb3f01454a2 100644 --- a/paddle/fluid/operators/custom_device_common_op_registry.cc +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -1366,7 +1366,10 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { float>, paddle::operators::CConcatOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - phi::dtype::float16>); + phi::dtype::float16>, + paddle::operators::CConcatOpCustomDeviceKernel< + paddle::platform::CustomDeviceContext, + phi::dtype::bfloat16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_split, device_type, @@ -1378,7 +1381,10 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { int>, paddle::operators::CSplitOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - phi::dtype::float16>); + phi::dtype::float16>, + paddle::operators::CSplitOpCustomDeviceKernel< + paddle::platform::CustomDeviceContext, + phi::dtype::bfloat16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_embedding, device_type, diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index d9bb602338352..8489b9b6c0e28 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -6,7 +6,6 @@ endif() register_operators( EXCLUDES fused_bn_activation_op - fusion_group_op fusion_lstm_op fused_bn_add_activation_op fused_attention_op @@ -38,10 +37,6 @@ if(WITH_GPU OR WITH_ROCM) # HIP not support cudnnTransformTensor # HIP not support cudnnConvolutionBiasActivationForward op_library(fused_gate_attention_op) - # fusion_group - if(NOT APPLE AND NOT WIN32) - op_library(fusion_group_op) - endif() # fused_bn_add_activation # HIP not support bn act fuse in MIOPEN if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc deleted file mode 100644 index b42dd927c6e31..0000000000000 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class FusionGroupOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, phi::GPUPlace(0)); - }; -}; - -class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Inputs", - "(std::vector) The inputs of fusion_group op.") - .AsDuplicable(); - AddOutput("Outs", - "(std::vector) The outputs of fusion_group op.") - .AsDuplicable(); - AddAttr>("outs_dtype", - "The data type of Outputs in fusion_group op.") - .SetDefault({}); - AddAttr>("inputs_dtype", - "The data type of Inputs in fusion_group op.") - .SetDefault({}); - AddAttr("type", "Fusion type.").SetDefault(0); - AddAttr("func_name", "Name of the generated functions.") - .SetDefault(""); - AddComment(R"DOC( -fusion_group Operator. - -It is used to execute a generated CUDA kernel which fuse the computation of -multiple operators into one. It supports several types: -0, fused computation of elementwise operations in which all the dims of inputs - and outputs should be exactly the same. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -DECLARE_INFER_SHAPE_FUNCTOR(fusion_group, - FusionGroupInferShapeFunctor, - PD_INFER_META(phi::FusionGroupInferMeta)); - -namespace ops = paddle::operators; -REGISTER_OPERATOR(fusion_group, - ops::FusionGroupOp, - ops::FusionGroupOpMaker, - FusionGroupInferShapeFunctor); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc deleted file mode 100644 index ac5cb81c060f0..0000000000000 --- a/paddle/fluid/operators/lstm_op.cc +++ /dev/null @@ -1,365 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
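(The "type 0" fusion described in the deleted op's DOC comment -- fusing elementwise operators whose inputs and outputs all have exactly the same dims -- amounts to emitting one kernel that evaluates the whole expression per element, instead of one memory pass per operator. A hedged host-side sketch of the idea only; the toy expression relu(2*x + y) is illustrative, and the real op executed generated CUDA code:)

    #include <cstddef>
    #include <vector>

    // One fused pass: the scale, add, and relu stay in a register, so no
    // intermediate tensor is materialized between the three operators.
    std::vector<float> FusedScaleAddRelu(const std::vector<float>& x,
                                         const std::vector<float>& y) {
      std::vector<float> out(x.size());
      for (std::size_t i = 0; i < x.size(); ++i) {
        float t = 2.0f * x[i] + y[i];
        out[i] = t > 0.0f ? t : 0.0f;
      }
      return out;
    }
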
*/ - -#include "paddle/fluid/operators/lstm_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class LSTMOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTM"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTM"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTM"); - - OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "LSTM"); - OP_INOUT_CHECK(ctx->HasOutput("Cell"), "Output", "Cell", "LSTM"); - - bool is_test = ctx->Attrs().Get("is_test"); - - if (!is_test) { - OP_INOUT_CHECK( - ctx->HasOutput("BatchGate"), "Output", "BatchGate", "LSTM"); - OP_INOUT_CHECK(ctx->HasOutput("BatchCellPreAct"), - "Output", - "BatchCellPreAct", - "LSTM"); - } - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2, - phi::errors::InvalidArgument( - "Input(X)'s rank must be 2, but received %d.", in_dims.size())); - - if (ctx->HasInput("H0")) { - PADDLE_ENFORCE_EQ( - ctx->HasInput("C0"), - true, - phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM " - "should not be null at the same time.")); - auto h_dims = ctx->GetInputDim("H0"); - auto c_dims = ctx->GetInputDim("C0"); - PADDLE_ENFORCE_EQ(h_dims, - c_dims, - phi::errors::InvalidArgument( - "The dimension of Input(H0) and Input(C0) should " - "be the same, but received [%s] (H0) vs [%s] (C0).", - h_dims, - c_dims)); - } - - int frame_size = static_cast(in_dims[1] / 4); - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(Weight) should be 2, but received %d.", - w_dims.size())); - PADDLE_ENFORCE_EQ(w_dims[0], - frame_size, - phi::errors::InvalidArgument( - "The first dimension of Input(Weight) should be %d, " - "but received %d.", - frame_size, - w_dims[0])); - PADDLE_ENFORCE_EQ(w_dims[1], - 4 * frame_size, - phi::errors::InvalidArgument( - "The second dimension of Input(Weight) should be 4 * " - "%d, but received %d.", - frame_size, - w_dims[1])); - - auto b_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - b_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(Bias) should be 2, but received %d.", - b_dims.size())); - PADDLE_ENFORCE_EQ( - b_dims[0], - 1, - phi::errors::InvalidArgument( - "The first dimension of Input(Bias) should be 1, but received %d.", - b_dims[0])); - - if (ctx->Attrs().Get("use_peepholes")) { - PADDLE_ENFORCE_EQ( - b_dims[1], - 7 * frame_size, - phi::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 7 * %d if enable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - b_dims[1], - 4 * frame_size, - phi::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 4 * %d if disable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } - - phi::DDim out_dims({in_dims[0], frame_size}); - ctx->SetOutputDim("Hidden", out_dims); - ctx->SetOutputDim("Cell", out_dims); - if (!is_test) { - ctx->SetOutputDim("BatchGate", in_dims); - ctx->SetOutputDim("BatchCellPreAct", out_dims); - } - ctx->ShareLoD("Input", "Hidden"); - ctx->ShareLoD("Input", "Cell"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return 
phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Input", - "(phi::DenseTensor) the first input is a phi::DenseTensor, which " - "support variable-time length input sequence. The underlying tensor in " - "this phi::DenseTensor is a matrix with shape (T X 4D), where T is the " - "total time steps in this mini-batch, D is the hidden size."); - AddInput("H0", - "(Tensor, optional) the initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") - .AsDispensable(); - AddInput("C0", - "(Tensor, optional) the initial cell state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `H0` and `C0` can be NULL but only at the same time.") - .AsDispensable(); - AddInput("Weight", - "(Tensor) the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); - AddInput("Bias", - "(Tensor) the learnable weights, which contains two parts: " - "input-hidden bias weight and peephole connections weight if " - "setting `use_peepholes` True. " - "1. `use_peepholes = False` " - " - The shape is (1 x 4D). " - " - Bias = {b_c, b_i, b_f, b_o}." - "2. `use_peepholes = True` " - " - The shape is (1 x 7D). " - " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); - AddOutput("Hidden", - "(phi::DenseTensor) the hidden state of LSTM operator. " - "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput("Cell", - "(phi::DenseTensor) the cell state of LSTM operator. " - "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput( - "BatchGate", - "(phi::DenseTensor) This phi::DenseTensor contains input gate, forget " - "gate " - "and output gate after the nonlinear computation. This " - "phi::DenseTensor has the same shape as the reorganized input, which " - "is also be called batch input. The LoD size is 2. The first " - "LoD is the batch offsets and the second LoD contains the " - "indexes, which denote the position of reorganized sequence " - "in the raw input.") - .AsIntermediate() - .AsExtra(); - AddOutput("BatchCellPreAct", - "(phi::DenseTensor) This phi::DenseTensor is obtained in the " - "forward and used " - "in the backward.") - .AsIntermediate() - .AsExtra(); - AddAttr("use_peepholes", - "(bool, default: True) " - "whether to enable diagonal/peephole connections.") - .SetDefault(true); - AddAttr("is_reverse", - "(bool, default: False) " - "whether to compute reversed LSTM.") - .SetDefault(false); - AddAttr("is_test", "True if in test phase.").SetDefault(false); - AddAttr( - "gate_activation", - "(string, default: sigmoid)" - "The activation for input gate, forget gate and output " - "gate, `sigmoid` by default.") - .SetDefault("sigmoid") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("cell_activation", - "(string, default: tanh)" - "The activation for cell output, `tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("candidate_activation", - "(string, default: tanh)" - "The activation for candidate hidden state, " - "`tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddComment(R"DOC( -Long-Short Term Memory (LSTM) Operator. 
- -The default implementation is diagonal/peephole connection -(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: - -$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$ - -$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$ - -$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$ - -$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$ - -$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ - -$$ h_t = o_t \\odot act_h(c_t) $$ - -- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix - of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ - are diagonal weight matrices for peephole connections. In our implementation, - we use vectors to represent these diagonal weight matrices. -- The b terms denote bias vectors ($b_i$ is the input gate bias vector). -- $\sigma$ is the non-line activations, such as logistic sigmoid function. -- $i, f, o$ and $c$ are the input gate, forget gate, output gate, - and cell activation vectors, respectively, all of which have the same size as - the cell output activation vector $h$. -- The $\odot$ is the element-wise product of the vectors. -- $act_g$ and $act_h$ are the cell input and cell output activation functions - and `tanh` is usually used for them. -- $\tilde{c_t}$ is also called candidate hidden state, - which is computed based on the current input and the previous hidden state. - -Set `use_peepholes` False to disable peephole connection. The formula -is omitted here, please refer to the paper -http://www.bioinf.jku.at/publications/older/2604.pdf for details. - -Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ -operations on the input $x_{t}$ are NOT included in this operator. -Users can choose to use fully-connect operator before LSTM operator. 
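(Tying the deleted InferShape checks to the DOC strings above: with hidden size D, the code's frame_size = in_dims[1] / 4, every enforced shape follows mechanically. A worked instance for an arbitrary D = 8:)

$$
\mathrm{Input}: T \times 4D = T \times 32, \quad
\mathrm{Weight}: D \times 4D = 8 \times 32, \quad
\mathrm{Bias}: 1 \times 7D = 1 \times 56 \text{ (with peepholes), else } 1 \times 4D = 1 \times 32, \quad
\mathrm{Hidden},\ \mathrm{Cell}: T \times D = T \times 8.
$$
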
- -)DOC"); - } -}; - -class LSTMGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Hidden"), "Input", "Hidden", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Cell"), "Input", "Cell", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTM@Grad"); - - OP_INOUT_CHECK( - ctx->HasInput("BatchGate"), "Input", "BatchGate", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("BatchCellPreAct"), - "Input", - "BatchCellPreAct", - "LSTM@Grad"); - - auto SetOutGradDim = [&ctx](const std::string& name) { - auto g_name = framework::GradVarName(name); - if (ctx->HasOutput(g_name)) - ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); - }; - - SetOutGradDim("Input"); - SetOutGradDim("Weight"); - SetOutGradDim("Bias"); - SetOutGradDim("H0"); - SetOutGradDim("C0"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -template -class LSTMGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("lstm_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("Input", this->Input("Input")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - - if (this->HasInput("H0")) { - op->SetInput("H0", this->Input("H0")); - op->SetOutput(framework::GradVarName("H0"), this->InputGrad("H0")); - } - - if (this->HasInput("C0")) { - op->SetInput("C0", this->Input("C0")); - op->SetOutput(framework::GradVarName("C0"), this->InputGrad("C0")); - } - - op->SetInput("Weight", this->Input("Weight")); - op->SetOutput(framework::GradVarName("Weight"), this->InputGrad("Weight")); - - op->SetInput("Bias", this->Input("Bias")); - op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - - op->SetInput("Cell", this->Output("Cell")); - - op->SetInput("Hidden", this->Output("Hidden")); - op->SetInput(framework::GradVarName("Hidden"), this->OutputGrad("Hidden")); - - op->SetInput("BatchGate", this->Output("BatchGate")); - op->SetInput("BatchCellPreAct", this->Output("BatchCellPreAct")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lstm, - ops::LSTMOp, - ops::LSTMOpMaker, - ops::LSTMGradOpMaker, - ops::LSTMGradOpMaker); -REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); - -PD_REGISTER_STRUCT_KERNEL( - lstm, CPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstm_grad, CPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc deleted file mode 100644 index b06521088a95a..0000000000000 --- a/paddle/fluid/operators/lstm_op.cu.cc +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/lstm_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - lstm, GPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstm_grad, GPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h deleted file mode 100644 index 9eaba45a2d597..0000000000000 --- a/paddle/fluid/operators/lstm_op.h +++ /dev/null @@ -1,444 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" -#include "paddle/phi/kernels/funcs/sequence2batch.h" - -namespace paddle { -namespace operators { - -template -inline void ReorderInitState(const DeviceContext& ctx, - const phi::DenseTensor& src, - phi::Vector index_lod, - phi::DenseTensor* dst, - bool indexed_src) { - phi::funcs::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index_lod, dst, indexed_src); -} - -template -class LSTMKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - bool is_test = ctx.Attr("is_test"); - - auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - phi::DenseTensor* batch_gate = nullptr; - phi::DenseTensor batch_gate_temp; - if (is_test) { - batch_gate = &batch_gate_temp; - batch_gate->Resize(input->dims()); - } else { - batch_gate = ctx.Output("BatchGate"); - } - batch_gate->mutable_data(ctx.GetPlace()); - auto* hidden_out = ctx.Output("Hidden"); - hidden_out->mutable_data(ctx.GetPlace()); - auto* cell_out = ctx.Output("Cell"); - cell_out->mutable_data(ctx.GetPlace()); - - bool is_reverse = ctx.Attr("is_reverse"); - phi::funcs::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.template device_context(); - to_batch(device_ctx, *input, batch_gate, true, is_reverse); - - auto in_dims = input->dims(); - int frame_size = static_cast(in_dims[1] / 4); - phi::DDim dims({in_dims[0], frame_size}); - - if (bias) { - phi::DenseTensor b = *bias; - b.Resize({bias->numel(), 1}); - phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); - phi::funcs::RowwiseAdd add_bias; - add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); - } - - 
phi::funcs::LstmMetaValue lstm_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - // the code style in LstmMetaValue will be updated later. - - lstm_value.check_ig = bias_data + 4 * frame_size; - lstm_value.check_fg = lstm_value.check_ig + frame_size; - lstm_value.check_og = lstm_value.check_fg + frame_size; - } else { - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - } - lstm_value.prev_state_value = nullptr; - phi::DenseTensor ordered_c0; - - phi::Vector order(batch_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState( - device_ctx, *cell_t0, order, &ordered_c0, true); - lstm_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. - phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp; - phi::DenseTensor* batch_cell_pre_act; - if (is_test) { - batch_cell_pre_act = &batch_cell_pre_act_temp; - } else { - batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - } - batch_hidden.mutable_data(dims, ctx.GetPlace()); - batch_cell.mutable_data(dims, ctx.GetPlace()); - batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - auto blas = phi::funcs::GetBlas(device_ctx); - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); - phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); - phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_hidden_t, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } else if (hidden_t0) { - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. 
- phi::DenseTensor ordered_h0; - ReorderInitState( - device_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } - - lstm_value.gate_value = gate_t.data(); - lstm_value.output_value = out_t.data(); - lstm_value.state_value = cell_t.data(); - lstm_value.state_active_value = cell_pre_act_t.data(); - T cell_clip = 0.0; - phi::funcs::LstmUnitFunctor::compute(device_ctx, - lstm_value, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - lstm_value.prev_state_value = lstm_value.state_value; - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - batch_hidden.set_lod(batch_gate->lod()); - // restore the output hidden in phi::DenseTensor from the batch hidden - to_seq(device_ctx, batch_hidden, hidden_out); - - batch_cell.set_lod(batch_gate->lod()); - // restore the output cell state in phi::DenseTensor from the batch cell - to_seq(device_ctx, batch_cell, cell_out); - } -}; - -template -class LSTMGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - - auto* hidden_out = ctx.Input("Hidden"); - auto* cell_out = ctx.Input("Cell"); - - auto* batch_gate = ctx.Input("BatchGate"); - auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); - - auto* hidden_g = - ctx.Input(framework::GradVarName("Hidden")); - - auto* in_g = ctx.Output(framework::GradVarName("Input")); - auto* weight_g = - ctx.Output(framework::GradVarName("Weight")); - auto* bias_g = ctx.Output(framework::GradVarName("Bias")); - - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); - - auto* h0_g = ctx.Output(framework::GradVarName("H0")); - auto* c0_g = ctx.Output(framework::GradVarName("C0")); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - if (weight_g) { - weight_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, weight_g, static_cast(0.0)); - } - - // ordered_h0/c0 is the reordered hidden/cell initialization. - // ordered_h0_g/c0_g is the reordered gradient of hidden/cell - // initialization. 
- phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - phi::Vector order(batch_gate->lod()[2]); - - if (c0) { - ReorderInitState( - device_ctx, *c0, order, &ordered_c0, true); - } - if (c0 && c0_g) { - ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); - } - - auto in_dims = input->dims(); - auto out_dims = hidden_g->dims(); - int frame_size = static_cast(in_dims[1] / 4); - PADDLE_ENFORCE_EQ( - frame_size, - out_dims[1], - phi::errors::InvalidArgument( - "The second dimension of Input(" + - framework::GradVarName("Hidden") + - ") should be %d, but received %d in LSTM@Grad operator.", - frame_size, - out_dims[1])); - - phi::funcs::LstmMetaValue lstm_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - lstm_value.check_ig = bias_data + 4 * frame_size; - lstm_value.check_fg = lstm_value.check_ig + frame_size; - lstm_value.check_og = lstm_value.check_fg + frame_size; - } else { - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - } - - phi::funcs::LstmMetaGrad lstm_grad; - - if (bias && bias_g) { - bias_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, bias_g, static_cast(0.0)); - } - if (bias && bias_g && ctx.Attr("use_peepholes")) { - T* bias_g_data = bias_g->data(); - lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; - lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; - lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; - } else { - lstm_grad.check_ig_grad = nullptr; - lstm_grad.check_fg_grad = nullptr; - lstm_grad.check_og_grad = nullptr; - } - - phi::funcs::LoDTensor2BatchFunctor to_batch; - - auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, - const phi::DenseTensor& src, - const phi::DDim& dims, - phi::DenseTensor& dst) { - dst.mutable_data(dims, ctx.GetPlace()); - dst.set_lod(batch_gate->lod()); - to_batch(ctx, src, &dst, false); - }; - - phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell; - ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden); - ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g); - ToBatch(device_ctx, *cell_out, out_dims, batch_cell); - - phi::DenseTensor batch_cell_g, batch_gate_g; - batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); - // TODO(qingqing) support the case output cell has gradient. 
- // to_batch(device_ctx, *cell_g, batch_cell_g, false); - zero(device_ctx, &batch_cell_g, static_cast(0.0)); - batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); - batch_gate_g.set_lod(batch_gate->lod()); - - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto blas = phi::funcs::GetBlas(device_ctx); - for (int n = static_cast(num_batch) - 1; n >= 0; n--) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate = batch_gate->Slice(bstart, bend); - phi::DenseTensor cell = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstm_value.gate_value = gate.data(); - lstm_value.state_value = cell.data(); - lstm_value.state_active_value = cell_pre_act.data(); - - phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); - phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); - phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); - lstm_grad.state_grad = cell_g.data(); - lstm_grad.gate_grad = gate_g.data(); - lstm_grad.output_grad = out_g.data(); - - if (n > 0) { - int bstart_pre = static_cast(batch_starts[n - 1]); - phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstm_value.prev_state_value = cell_pre.data(); - lstm_grad.prev_state_grad = cell_pre_g.data(); - } else { - lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; - lstm_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; - } - - // lstm_value.output_value not used in bp, set to nullptr - // lstm_grad.state_active_grad not used in bp, set to nullptr - lstm_value.output_value = nullptr; - lstm_grad.state_active_grad = nullptr; - int cur_batch_size = bend - bstart; - T cell_clip = 0.0; - phi::funcs::LstmUnitGradFunctor::compute(device_ctx, - lstm_value, - lstm_grad, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &pre_hidden_g, - static_cast(1.0)); - if (weight_g) { - /* backward weight */ - auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_hidden, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } else { - if (h0 && weight_g) { - ReorderInitState( - device_ctx, *h0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - if (h0 && h0_g) { - ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &ordered_h0_g, - static_cast(0.0)); - } - } - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - if (in_g) { - /* backward data */ - in_g->mutable_data(ctx.GetPlace()); - to_seq(device_ctx, batch_gate_g, in_g); - } - if (bias && bias_g) { - /* backward bias */ - phi::DenseTensor b_g = *bias_g; - b_g.Resize({bias_g->numel(), 1}); - phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - phi::funcs::ColwiseSum col_sum; - col_sum(device_ctx, batch_gate_g, &gate_bias_g); - } - - if (h0 && h0_g) { - ReorderInitState( - device_ctx, ordered_h0_g, order, h0_g, false); - } - if (c0 && c0_g) { - ReorderInitState( - device_ctx, ordered_c0_g, order, c0_g, false); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc deleted file mode 100644 index 7fb293891d3a5..0000000000000 --- a/paddle/fluid/operators/number_count_op.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
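(Both deleted LSTM kernels reorder H0/C0 because LoDTensor2BatchFunctor regroups rows time-step-major before the recurrence runs. A worked illustration, under the assumption -- consistent with the Slice(bstart, bend) loops above -- that sequences are processed in order of decreasing length:)

    lengths (sorted): 3, 2, 1       -> num_batch = 3 time steps
    batch_starts    : {0, 3, 5, 6}  // step n owns rows [batch_starts[n], batch_starts[n+1])
    step 0 rows: seq0[0], seq1[0], seq2[0]
    step 1 rows: seq0[1], seq1[1]
    step 2 rows: seq0[2]

(The permutation between original and batch order is carried in batch_gate->lod()[2], the order vector above, which is why the forward pass builds ordered_h0/ordered_c0 with indexed_src = true and the backward pass scatters h0_g/c0_g back with indexed_src = false.)
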
- -#include "paddle/fluid/operators/number_count_op.h" - -namespace paddle { -namespace operators { - -class NumberCountOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("numbers"), "Input", "numbers", "NumberCount"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "number_count", "NumberCount"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // the dtype of the numbers should be same as int64 - auto number_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "numbers"); - - PADDLE_ENFORCE_EQ(number_dtype, - framework::proto::VarType::INT64, - phi::errors::InvalidArgument( - "The dtype of the number_dtype should be int64")); - return phi::KernelKey(number_dtype, ctx.GetPlace()); - } -}; - -class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("numbers", "(Tensor) The input gate index tensor."); - AddOutput("Out", "(Tensor) The output number count tensor."); - AddAttr("upper_range", "(int), The number of different numbers."); - - AddComment(R"DOC(number_count Operator.count numbers.)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(number_count, - ops::NumberCountOp, - ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h deleted file mode 100644 index 12ad10c3e73cc..0000000000000 --- a/paddle/fluid/operators/number_count_op.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_GLOO) -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#endif - -namespace paddle { -namespace operators { - -template -class NumberCountOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override {} -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/ops_signature/assign_pos_sig.cc b/paddle/fluid/operators/ops_signature/assign_pos_sig.cc deleted file mode 100644 index 010d164d83dae..0000000000000 --- a/paddle/fluid/operators/ops_signature/assign_pos_sig.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature AssignPosOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature( - "assign_pos", {"X", "cum_count", "eff_num_len"}, {}, {"Out"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(assign_pos, phi::AssignPosOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc b/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc deleted file mode 100644 index d622a8a342789..0000000000000 --- a/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DecayedAdagradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("decayed_adagrad", - {"Param", "Grad", "Moment", "LearningRate"}, - {"decay", "epsilon"}, - {"ParamOut", "MomentOut"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(decayed_adagrad, - phi::DecayedAdagradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/fusion_group_sig.cc b/paddle/fluid/operators/ops_signature/fusion_group_sig.cc deleted file mode 100644 index 666e6f77d218f..0000000000000 --- a/paddle/fluid/operators/ops_signature/fusion_group_sig.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature FusionGroupOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("fusion_group", - {"Inputs"}, - {"outs_dtype", "inputs_dtype", "func_name", "type"}, - {"Outs"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(fusion_group, phi::FusionGroupOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/rrelu_sig.cc b/paddle/fluid/operators/ops_signature/rrelu_sig.cc deleted file mode 100644 index 18bda743e3255..0000000000000 --- a/paddle/fluid/operators/ops_signature/rrelu_sig.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature RReluOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature( - "rrelu", {"X"}, {"lower", "upper", "is_test"}, {"Out", "Noise"}); -} - -KernelSignature RReluGradGradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature( - "rrelu_grad", {"X", "Noise", "Out@GRAD"}, {}, {"X@GRAD"}); -} -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(rrelu, phi::RReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rrelu_grad, phi::RReluGradGradOpArgumentMapping); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc deleted file mode 100644 index 23441206a55c1..0000000000000 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class DecayedAdagradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Param"), "Input", "Param", "DecayedAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Moment"), "Input", "Moment", "DecayedAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), - "Input", - "LearningRate", - "DecayedAdagradOp"); - PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - phi::errors::InvalidArgument( - "The input var's type should be phi::DenseTensor, " - "but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - phi::errors::InvalidArgument( - "The input var's type should be phi::DenseTensor, " - "but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK( - ctx->HasOutput("ParamOut"), "Output", "ParamOut", "DecayedAdagradOp"); - OP_INOUT_CHECK( - ctx->HasOutput("MomentOut"), "Output", "MomentOut", "DecayedAdagradOp"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(common::product(lr_dims), - 0, - phi::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ( - common::product(lr_dims), - 1, - phi::errors::InvalidArgument("LearningRate should have one element")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Grad"), - phi::errors::InvalidArgument( - "Param and Grad input of DecayedAdagradOp should have " - "the same dimension.")); - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Moment"), - phi::errors::InvalidArgument( - "Param and Moment input of DecayedAdagradOp should have " - "the same dimension.")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - } - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"), - ctx.GetPlace()); - } -}; - -class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", "(Tensor) Input parameter"); - AddInput("Grad", "(Tensor) Input gradient"); - AddInput("Moment", "(Tensor) Second moment"); - AddInput("LearningRate", "(Tensor) Learning rate"); - - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("MomentOut", "(Tensor) Output second moment"); - - AddAttr("decay", - "(float, default 0.95) " - "Discounting factor for coming gradient") - .SetDefault(0.95); - AddAttr("epsilon", - "(float, default 1.0e-6) " - "Constant for numerical stability") - .SetDefault(1.0e-6f); - AddComment(R"DOC( -Decayed Adagrad Optimizer. 
- -The update is done as follows: - -$$ -moment\_out = decay * moment + (1 - decay) * grad * grad \\ -param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon} -$$ - -The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) -does not have an epsilon attribute. It is added here for numerical -stability to avoid the division by zero error. - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(decayed_adagrad, - DecayedAdagradShapeFunctor, - PD_INFER_META(phi::DecayedAdagradInferMeta)); - -REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, - ops::DecayedAdagradOp, - ops::DecayedAdagradOpMaker, - DecayedAdagradShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 1997d1fb99fd2..73ad94c0a5c6a 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index 2ed2e3278acad..fce12ae865173 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index ba4f188274d18..464a8e547e508 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -12,19 +12,215 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" - #include #include #include #include #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" +namespace ops = paddle::operators; namespace paddle { namespace operators { +class ReduceBaseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceBaseOp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + auto dims = ctx->Attrs().Get>("dim"); + PADDLE_ENFORCE_GT(dims.size(), + 0, + phi::errors::InvalidArgument( + "The input dim dimensions of ReduceBaseOp " + "should be greater than 0. But received the dim " + "dimensions of Reduce = %d.", + dims.size())); + + for (size_t i = 0; i < dims.size(); ++i) { + PADDLE_ENFORCE_LT( + dims[i], + x_rank, + phi::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimension = %d. 
But received dim index = %d.", + i, + x_rank, + dims[i])); + PADDLE_ENFORCE_GE( + dims[i], + -x_rank, + phi::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimension = %d. But received dim index = %d.", + i, + x_rank, + dims[i])); + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim("Out", + common::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = common::vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + if (!keep_dim && dims_vector.size() == 0) { + dims_vector.push_back(1); + } + auto out_dims = common::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims.size() > 0 && dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } + + // oneDNN's reduction kernel is optimized only for reducing throughout the + // most outer dims, so in case of another type of reduction, it would be + // better to fallback to native implementation + static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) { + // native reduce kernels don't support bf16 + // so oneDNN kernel is enforced in that case + if (ctx.Input("X")->dtype() == phi::DataType::BFLOAT16) + return true; + + if (!ctx.HasAttr("dim") || !ctx.HasAttr("reduce_all")) { + return false; + } + + auto reduce_dims = ctx.Attr>("dim"); + const bool reduce_all = ctx.Attr("reduce_all"); + int ndims = ctx.Input("X")->dims().size(); + + if (reduce_all) { + return true; + } + + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i]; + } + sort(reduce_dims.begin(), reduce_dims.end()); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[reduce_dims.size() - i - 1] != + static_cast(ndims - i - 1)) { + return false; + } + } + + return true; + } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // choose cudnn kernel if the runtime supported. 
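As a reading aid, the output-shape rule implemented by the new ReduceBaseOp::InferShape above can be condensed into a minimal standalone C++ sketch (an illustration only, not Paddle code; ReducedShape is a hypothetical name): negative axes are wrapped into range, reduced axes are kept with length 1 when keep_dim is set and erased otherwise, reduce_all collapses everything, and an empty result is represented as {1}.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical standalone helper mirroring ReduceBaseOp::InferShape's
// output-shape rule; illustration only, not the Paddle API.
std::vector<int64_t> ReducedShape(const std::vector<int64_t>& x_dims,
                                  std::vector<int> axes,
                                  bool keep_dim,
                                  bool reduce_all) {
  const int rank = static_cast<int>(x_dims.size());
  if (reduce_all) {
    return keep_dim ? std::vector<int64_t>(rank, 1) : std::vector<int64_t>{1};
  }
  for (int& a : axes) {
    assert(a >= -rank && a < rank);  // the op enforces this range
    if (a < 0) a += rank;            // wrap negative axes
  }
  std::sort(axes.begin(), axes.end());
  std::vector<int64_t> out;
  for (int i = 0; i < rank; ++i) {
    const bool reduced = std::binary_search(axes.begin(), axes.end(), i);
    if (!reduced) {
      out.push_back(x_dims[i]);
    } else if (keep_dim) {
      out.push_back(1);  // reduced axis kept with length 1
    }
  }
  if (out.empty()) out.push_back(1);  // scalar results are shaped as {1}
  return out;
}

int main() {
  // Reducing axes {-1, 0} of a [2, 3, 4] tensor without keep_dim gives [3].
  for (int64_t d : ReducedShape({2, 3, 4}, {-1, 0}, false, false)) {
    std::cout << d << ' ';
  }
  std::cout << '\n';
}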
+ auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL + if (ctx.Input("X")->dims().size() > 5 || + !HasOptimizedOneDNNKernel(ctx)) { + this->SetDnnFallback(true); + } + // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL + + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE_EQ( + ctx.GetPlace().GetType() == phi::AllocationType::GPU || + ctx.GetPlace().GetType() == phi::AllocationType::XPU || + ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM, + true, + phi::errors::InvalidArgument( + "float16 can only be used on GPU or XPU place")); + } + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), + "Input", + "Out@GRAD", + "ReduceBaseOp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + // TODO(dev): We should delete InferShape and migrate it into + // UnchangeInferMeta. In case 'dim' is a Variable, it will + // not exist in Attrs but in Inputs. + if (ctx->HasAttr("dim")) { + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + PADDLE_ENFORCE_LT( + dims[i], + x_rank, + phi::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)], " + "which dimension = %d. But received dim index = %d.", + i, + x_rank, + dims[i])); + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + } + } + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } + } + + protected: + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + int out_dtype = ctx.Attr("out_dtype"); + auto input_data_type = + (out_dtype >= 0) + ? static_cast(out_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL + // max 5D tensor is supported + if (ctx.Input("X")->dims().size() > 5) { + dnn_fallback_ = true; + } + // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL + + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } +}; // NOTE(dengkaipeng): Input(Out) is unnecessary in reduce_mean_grad // calculation, but will incur a reduce_mean_grad op after @@ -65,6 +261,7 @@ class ReduceMeanDoubleGradDescMaker : public framework::GradOpDescMakerBase { return ops; } }; + class ReduceMeanDoubleGradOpBaseMaker : public imperative::GradOpBaseMakerBase { public: using imperative::GradOpBaseMakerBase::GradOpBaseMakerBase; @@ -89,6 +286,56 @@ class ReduceMeanDoubleGradOpBaseMaker : public imperative::GradOpBaseMakerBase { } }; DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceMeanGradNoNeedBufferVarInferer, "X"); + +class ReduceBaseOpMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. 
" + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}) + .SupportTensor(); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddAttr("in_dtype", + "(int, default -1)" + "The dtype of input, default value is -1, the user could not " + "set this value.") + .SetDefault(-1); + AddAttr( + "out_dtype", + "(int, default -1)" + "The dtype of output, default value is -1, the dtype is same as intput") + .SetDefault(-1); + AddComment(string::Sprintf(R"DOC( +%s Operator. + +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC", + GetOpType(), + GetName())); + } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h b/paddle/fluid/operators/reduce_ops/reduce_mean_op.h deleted file mode 100644 index eb82be83ba517..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -namespace paddle { -namespace operators { - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -// TODO(zengjinle): Should refine the numeric stability of FP16 reduce_mean -// and reduce_mean_grad later. -struct FP16MeanGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = (dy->template cast().broadcast(dim) / - dx->template cast().constant(size)) - .template cast(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h deleted file mode 100644 index 44a82397dcc07..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ /dev/null @@ -1,895 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" -// only can include the headers in paddle/phi/api dirs -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/phi_utils.h" -#include "paddle/phi/kernels/cpu/reduce.h" - -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) -#include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" -#endif - -namespace paddle { -namespace operators { - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - paddle::operators:: \ - ReduceFunctor( \ - context.template device_context(), \ - *input, \ - output, \ - dims, \ - keep_dim); \ - } - -using DDim = phi::DDim; - -inline void GetShuffledDim(const DDim& src_dims, - DDim* dst_dims, - const std::vector& reduced_dims, - std::vector* perm_axis) { - // check if it's a reduced dim - std::vector src_dims_check(src_dims.size(), false); - size_t src_size = src_dims.size(); - size_t reduce_size = reduced_dims.size(); - for (size_t i = 0; i < reduce_size; ++i) { - dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]]; - (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i]; - src_dims_check[reduced_dims[i]] = true; - } - - size_t offset = 0; - for (size_t i = 0; i < src_dims_check.size(); ++i) { - bool is_reduced = src_dims_check[i]; - if (!is_reduced) { - (*perm_axis)[offset] = i; - dst_dims->at(offset++) = src_dims[i]; - } - } -} - -static inline std::vector GetReduceDim(const std::vector& dims, - int dim_size, - bool reduce_all) { - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(dim_size); - int reduce_size = reduce_dims.size(); - for (int i = 0; i < reduce_size; ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - PADDLE_ENFORCE_LT(e, - dim_size, - phi::errors::InvalidArgument( - "ReduceBaseOp: invalid axis, when x_dims is %d, " - "axis[i] should less than x_dims, but got %d.", - dim_size, - e)); - reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); - } - } - return reduce_dims; -} -template -void GetShuffledInput(const framework::ExecutionContext& context, - const phi::DenseTensor* input, - phi::DenseTensor* shuffled_input, - const std::vector& dims) { - DDim shuffled_dims(input->dims()); - std::vector perm_axis(input->dims().size()); - GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis); - - shuffled_input->Resize(shuffled_dims); - shuffled_input->mutable_data(context.GetPlace()); - - phi::funcs::TransposeNormal trans; - trans(context.template device_context(), - *input, - shuffled_input, - perm_axis); -} - -inline void GetOriginDimFromShuffled(const DDim& src_dim, - const std::vector& dims, - std::vector* origin_dim) { - DDim shuffled_dims(src_dim); - size_t n = src_dim.size(); - std::vector perm_axis(n); - GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis); - for (size_t i = 0; i < n; ++i) { - (*origin_dim)[perm_axis[i]] = i; - } -} - -template -void HandleLargeDim(const framework::ExecutionContext& context, - const phi::DenseTensor* input, - phi::DenseTensor* output, - const std::vector& dims, - bool keep_dim) { - // shuffle the reduced dim to the end - phi::DenseTensor shuffled_input; - GetShuffledInput(context, input, &shuffled_input, dims); - - // transpose to 2D tensor whose shape is {unreduced, reduced}. - const int64_t unreduced = output->numel(); - const int64_t input_numel = shuffled_input.numel(); - // assume: 0 / 0 == 0, which allow process 0 dim tensor - const int64_t reduced = (unreduced != 0) ? (input_numel / unreduced) : 0; - - PADDLE_ENFORCE_EQ( - unreduced * reduced, - input_numel, - phi::errors::InvalidArgument( - "Reducing failed in HandleLargeDim, when try to transpose (%d) " - "operands into 2D tensor with shape (%d, %d).", - input_numel, - unreduced, - reduced)); - - shuffled_input.Resize({unreduced, reduced}); - - DDim output_dim = output->dims(); - output->Resize({unreduced}); - paddle::operators::ReduceFunctor( - context.template device_context(), - shuffled_input, - output, - {1}, - keep_dim); - output->Resize(output_dim); -} - -template -void HandleLargeDimGrad(const framework::ExecutionContext& context, - const phi::DenseTensor* x, - const phi::DenseTensor* out, - const phi::DenseTensor* dout, - phi::DenseTensor* dx, - Functor functor, - const std::vector& dims) { - const int64_t unreduced = out->numel(); - const int64_t x_numel = x->numel(); - // assume: 0 / 0 == 0, which allow process 0 dim tensor - const int64_t reduced = (unreduced != 0) ? 
(x_numel / unreduced) : 0; - - PADDLE_ENFORCE_EQ( - unreduced * reduced, - x_numel, - phi::errors::InvalidArgument( - "Reducing failed in HandleLargeDimGrad, when try to transpose (%d) " - "operands into 2D tensor with shape (%d, %d).", - x_numel, - unreduced, - reduced)); - - DDim out_dim(out->dims()); - DDim x_dim(x->dims()); - // transpose and reshape X - phi::DenseTensor shuffled_x; - GetShuffledInput(context, x, &shuffled_x, dims); - DDim shuffled_dim = shuffled_x.dims(); - shuffled_x.Resize({unreduced, reduced}); - // reshape dX {unreduced, reduced} - dx->Resize({unreduced, reduced}); - ReduceGradFunctor( - context.template device_context(), - shuffled_x, - *out, - *dout, - dx, - functor, - {1}); - // transpose dX - std::vector origin_axis(x_dim.size()); - GetOriginDimFromShuffled(x_dim, dims, &origin_axis); - phi::DenseTensor dx_tmp; - framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); - dx_tmp.Resize(shuffled_dim); - dx->Resize(x_dim); - phi::funcs::TransposeNormal trans; - trans(context.template device_context(), - dx_tmp, - dx, - origin_axis); -} - -template -struct ReduceKernelFunctor { - const phi::DenseTensor* input; - phi::DenseTensor* output; - std::vector dims; - bool keep_dim; - bool reduce_all; - const framework::ExecutionContext& context; - ReduceKernelFunctor(const phi::DenseTensor* input, - phi::DenseTensor* output, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - const framework::ExecutionContext& context) - : input(input), - output(output), - dims(dims), - keep_dim(keep_dim), - reduce_all(reduce_all), - context(context) {} - - template - void apply() const { - output->mutable_data(context.GetPlace()); - if (reduce_all) { - // Flatten and reduce 1-D tensor - auto x = EigenVector::Flatten(*input); - auto out = EigenScalar::From(*output); - auto& place = - *context.template device_context().eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - Functor functor; - functor(place, &x, &out, reduce_dim); - } else { - int ndim = input->dims().size(); - int rdim = dims.size(); - if (ndim > 6) { - HandleLargeDim( - context, input, output, dims, keep_dim); - } else { - HANDLE_DIM(6, 5); - HANDLE_DIM(6, 4); - HANDLE_DIM(6, 3); - HANDLE_DIM(6, 2); - HANDLE_DIM(6, 1); - HANDLE_DIM(5, 4); - HANDLE_DIM(5, 3); - HANDLE_DIM(5, 2); - HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); - } - } - } -}; -template -class ReduceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - int out_dtype = context.Attr("out_dtype"); - framework::proto::VarType::Type cast_out_dtype; - auto* input = context.Input("X"); - - if (out_dtype < 0) { - cast_out_dtype = static_cast( - framework::TransToProtoVarType(input->dtype())); - } else { - cast_out_dtype = static_cast(out_dtype); - } - - auto& dev_ctx = context.device_context(); - output->mutable_data( - dev_ctx.GetPlace(), - static_cast(cast_out_dtype)); - - std::vector tmp_dims(dims.begin(), dims.end()); - - // call new kernel - phi::Reduce::TYPE, - T, - Functor>( - static_cast::TYPE&>(dev_ctx), - *input, - reduce_all, - tmp_dims, - keep_dim, - framework::TransToPhiDataType(cast_out_dtype), - output); - } -}; - -template -void LaunchReduceGradKernel(const framework::ExecutionContext& 
context, - const phi::DenseTensor* input0, - const phi::DenseTensor* input1, - const phi::DenseTensor* input2, - phi::DenseTensor* output, - Functor functor, - const std::vector& dims, - bool reduce_all = false) { - if (reduce_all) { - auto x = EigenVector::Flatten(*input0); - auto x_reduce = EigenVector::Flatten(*input1); - auto x_reduce_grad = EigenVector::Flatten(*input2); - auto x_grad = EigenVector::Flatten(*output); - auto& place = - *context.template device_context().eigen_device(); - auto broadcast_dim = - Eigen::array({{static_cast(input0->numel())}}); - functor(place, - &x, - &x_reduce, - &x_grad, - &x_reduce_grad, - broadcast_dim, - broadcast_dim[0]); - } else { - int rank = input0->dims().size(); - switch (rank) { - case 1: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 2: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 3: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 4: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 5: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 6: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - default: - HandleLargeDimGrad( - context, input0, input1, input2, output, functor, dims); - break; - } - } -} - -template -class ReduceGradKernel : public framework::OpKernel { - public: - void ComputeFromInput(const phi::DenseTensor* input2, - const framework::ExecutionContext& context) const { - bool reduce_all = context.Attr("reduce_all"); - auto dims = context.Attr>("dim"); - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - - auto* output = - context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); - - // The dims has full dim, set the reduce_all is True - const auto& input_dim_size = - context.Input("X")->dims().size(); - std::set dims_set(dims.begin(), dims.end()); - bool full_dim = true; - for (auto i = 0; i < input_dim_size; i++) { - if (dims_set.find(i) == dims_set.end()) { - full_dim = false; - break; - } - } - reduce_all = (reduce_all || full_dim); - // NOTE: EigenTensor::From() uses tensor->data() - // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or - // kNoNeedBufferY should set true - // and use fake var that has same dims. 
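The Eigen expression dy->broadcast(dim) / dx->constant(size) in the deleted MeanGradFunctor above distributes each output gradient evenly over the elements that were averaged. A minimal standalone sketch of that rule for a single trailing-axis reduction (MeanGradLastAxis is a hypothetical helper, not Paddle code; the real path goes through LaunchReduceGradKernel/ReduceGradFunctor and handles arbitrary axes):

#include <iostream>
#include <vector>

// Hypothetical helper: gradient of mean over the trailing axis of a
// row-major [rows, cols] buffer. Each input element receives dout / cols,
// matching dy->broadcast(dim) / dx->constant(size) in MeanGradFunctor.
std::vector<float> MeanGradLastAxis(const std::vector<float>& dout,
                                    int rows, int cols) {
  std::vector<float> dx(static_cast<size_t>(rows) * cols);
  for (int r = 0; r < rows; ++r) {
    const float g = dout[r] / static_cast<float>(cols);  // dy / size
    for (int c = 0; c < cols; ++c) {
      dx[static_cast<size_t>(r) * cols + c] = g;  // broadcast over the axis
    }
  }
  return dx;
}

int main() {
  // For a 2x3 input reduced by mean over the last axis, dout = {3, 6}
  // yields dx = {1, 1, 1, 2, 2, 2}.
  for (float v : MeanGradLastAxis({3.f, 6.f}, 2, 3)) std::cout << v << ' ';
  std::cout << '\n';
}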
- if (kNoNeedBufferX) { - input0 = output; - } - if (kNoNeedBufferY) { - input1 = input2; - } - - const std::vector const_dims = dims; - - // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and - // not be set as Input in grad Maker, use Out_grad to replace here - if (!input1) input1 = input2; - Functor functor; - LaunchReduceGradKernel(context, - input0, - input1, - input2, - output, - functor, - const_dims, - reduce_all); - } - - void Compute(const framework::ExecutionContext& context) const override { - int in_dtype = context.Attr("in_dtype"); - if (in_dtype >= 0) { - phi::DenseTensor tmp_tensor; - auto* pre_input = - context.Input(framework::GradVarName("Out")); - auto in_kernel_type = - phi::KernelKey(framework::TransToProtoVarType(pre_input->dtype()), - context.GetPlace()); - auto out_kernel_type = - phi::KernelKey(static_cast(in_dtype), - context.GetPlace()); - framework::TransDataType( - in_kernel_type, out_kernel_type, *pre_input, &tmp_tensor); - ComputeFromInput(&tmp_tensor, context); - - } else { - auto* input2 = - context.Input(framework::GradVarName("Out")); - ComputeFromInput(input2, context); - } - } -}; - -class ReduceBaseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceBaseOp"); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - auto dims = ctx->Attrs().Get>("dim"); - PADDLE_ENFORCE_GT(dims.size(), - 0, - phi::errors::InvalidArgument( - "The input dim dimensions of ReduceBaseOp " - "should be greater than 0. But received the dim " - "dimensions of Reduce = %d.", - dims.size())); - - for (size_t i = 0; i < dims.size(); ++i) { - PADDLE_ENFORCE_LT( - dims[i], - x_rank, - phi::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)] " - "which dimension = %d. But received dim index = %d.", - i, - x_rank, - dims[i])); - PADDLE_ENFORCE_GE( - dims[i], - -x_rank, - phi::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)] " - "which dimension = %d. But received dim index = %d.", - i, - x_rank, - dims[i])); - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim("Out", - common::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = common::vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - if (!keep_dim && dims_vector.size() == 0) { - dims_vector.push_back(1); - } - auto out_dims = common::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims.size() > 0 && dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. 
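HasOptimizedOneDNNKernel, both the copy added to reduce_mean_op.cc earlier in this diff and the deleted duplicate below, reduces to a trailing-suffix test: after wrapping negative axes and sorting, the reduced axes must be exactly {ndims-k, ..., ndims-1}. A standalone sketch of just that test (IsTrailingSuffix is a hypothetical name; the real function additionally forces the oneDNN kernel for bfloat16 inputs and accepts reduce_all):

#include <algorithm>
#include <iostream>
#include <vector>

// Hypothetical helper isolating the axis test in HasOptimizedOneDNNKernel:
// the reduction counts as optimized only when the reduced axes form the
// trailing suffix of the dimension list.
bool IsTrailingSuffix(std::vector<int> axes, int ndims) {
  for (int& a : axes) {
    if (a < 0) a += ndims;  // wrap negative axes, as the op does
  }
  std::sort(axes.begin(), axes.end());
  const int k = static_cast<int>(axes.size());
  for (int i = 0; i < k; ++i) {
    if (axes[i] != ndims - k + i) return false;  // gap before the suffix
  }
  return true;
}

int main() {
  std::cout << IsTrailingSuffix({-1, 2}, 4) << '\n';  // 1: axes are {2, 3}
  std::cout << IsTrailingSuffix({0, 3}, 4) << '\n';   // 0: {0, 3} is no suffix
}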
- ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } - - // oneDNN's reduction kernel is optimized only for reducing throughout the - // most outer dims, so in case of another type of reduction, it would be - // better to fallback to native implementation - static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) { - // native reduce kernels don't support bf16 - // so oneDNN kernel is enforced in that case - if (ctx.Input("X")->dtype() == phi::DataType::BFLOAT16) - return true; - - if (!ctx.HasAttr("dim") || !ctx.HasAttr("reduce_all")) { - return false; - } - - auto reduce_dims = ctx.Attr>("dim"); - const bool reduce_all = ctx.Attr("reduce_all"); - int ndims = ctx.Input("X")->dims().size(); - - if (reduce_all) { - return true; - } - - for (size_t i = 0; i < reduce_dims.size(); ++i) { - if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i]; - } - sort(reduce_dims.begin(), reduce_dims.end()); - for (size_t i = 0; i < reduce_dims.size(); ++i) { - if (reduce_dims[reduce_dims.size() - i - 1] != - static_cast(ndims - i - 1)) { - return false; - } - } - - return true; - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // choose cudnn kernel if the runtime supported. - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - if (ctx.Input("X")->dims().size() > 5 || - !HasOptimizedOneDNNKernel(ctx)) { - this->SetDnnFallback(true); - } - // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL - - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - ctx.GetPlace().GetType() == phi::AllocationType::GPU || - ctx.GetPlace().GetType() == phi::AllocationType::XPU || - ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM, - true, - phi::errors::InvalidArgument( - "float16 can only be used on GPU or XPU place")); - } - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ReduceOpUseInputPlace : public ReduceBaseOp { - public: - using ReduceBaseOp::ReduceBaseOp; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - phi::KernelKey kt = OperatorWithKernel::GetExpectedKernelType(ctx); - kt.set_backend( - phi::TransToPhiBackend(ctx.Input("X")->place())); - return kt; - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "ReduceBaseOp"); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - // TODO(dev): We should delete Infershape and migrate it into - // UnchangeInferMeta.In case of 'dim' is Variable, it will - // not exist in Attrs but in Inputs. - if (ctx->HasAttr("dim")) { - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - PADDLE_ENFORCE_LT( - dims[i], - x_rank, - phi::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)], " - "which dimension = %d. 
But received dim index = %d.", - i, - x_rank, - dims[i])); - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - } - } - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - int out_dtype = ctx.Attr("out_dtype"); - auto input_data_type = - (out_dtype >= 0) - ? static_cast(out_dtype) - : OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - - // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - // max 5D tensor is supported - if (ctx.Input("X")->dims().size() > 5) { - dnn_fallback_ = true; - } - // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ReduceBaseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}) - .SupportTensor(); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddAttr("in_dtype", - "(int, default -1)" - "The dtype of input, default value is -1, the user could not " - "set this value.") - .SetDefault(-1); - AddAttr( - "out_dtype", - "(int, default -1)" - "The dtype of output, default value is -1, the dtype is same as intput") - .SetDefault(-1); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. 
- -)DOC", - GetOpType(), - GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) -template - class ReduceBaseOp, - template - class TransformOp> -class ReduceCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - const phi::DenseTensor* input = context.Input("X"); - phi::DenseTensor* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - auto pt_out_dtype = paddle::framework::TransToPhiDataType( - static_cast(out_dtype)); - std::vector dims = context.Attr>("dim"); -#ifdef PADDLE_WITH_XPU_KP - auto& dev_ctx = context.template device_context(); -#else - auto& dev_ctx = context.cuda_device_context(); -#endif - if (out_dtype >= 0) { - output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); - } else { - output->mutable_data(dev_ctx.GetPlace(), input->dtype()); - } - - std::vector dims_int64{dims.begin(), dims.end()}; - - phi::Reduce( - dev_ctx, *input, reduce_all, dims_int64, false, pt_out_dtype, output); - } -}; - -#ifndef PADDLE_WITH_XPU_KP -template class TransformOp> -class ReduceCudaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - std::vector dims = context.Attr>("dim"); - auto* in_x = context.Input("X"); - - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_dtype = context.Attr("in_dtype"); - auto pt_out_dtype = framework::TransToPhiDataType( - static_cast(out_dtype)); - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = common::vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - phi::DenseTensor new_d_out(d_out->type()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(common::make_ddim(update_dims)); - auto& dev_ctx = context.cuda_device_context(); - if (out_dtype > 0) { - d_x->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); - } else { - d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); - } - auto pt_d_out = std::make_unique(new_d_out); - auto pt_d_x = std::make_unique(*d_x); - if (out_dtype <= 0) { - pt_out_dtype = d_out->dtype(); - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - phi::ReduceGrad>(dev_ctx, - pt_d_out.get(), - pt_d_x.get(), - pt_out_dtype, - TransformOp(reduce_num)); - } -}; - -template -struct EqualFunctor { - inline T initial() { return static_cast(0.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { - return static_cast(a == b); - } -}; - -template -struct DivideFunctor { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; -#endif -#endif - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceBaseOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR( \ - op_name, \ 
- ops::ReduceBaseOp, \ - __##op_name##Maker__, \ - paddle::framework::DefaultGradOpMaker, \ - paddle::framework::DefaultGradOpMaker); \ - REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) - -#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name, ...) \ - class __##op_name##Maker__ : public ops::ReduceBaseOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR( \ - op_name, \ - ops::ReduceBaseOp##__VA_ARGS__, \ - __##op_name##Maker__, \ - paddle::framework::EmptyGradOpMaker, \ - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h deleted file mode 100644 index b8043dcd94ba0..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace paddle { -namespace operators { - -using DDim = phi::DDim; -template -using EigenTensor = phi::EigenTensor; -template -using EigenScalar = phi::EigenScalar; -template -using EigenVector = phi::EigenVector; - -template -void ReduceFunctor(const DeviceContext& context, - const phi::DenseTensor& input, - phi::DenseTensor* output, - const std::vector& dims, - bool keep_dim) { - auto x = EigenTensor::From(input); - auto x_rank = static_cast(x.dimensions().size()); - auto reduce_dim = Eigen::array(); - std::vector dims_ref = dims; - for (size_t i = 0; i < dims_ref.size(); ++i) { - if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; - reduce_dim[i] = dims_ref[i]; - } - // construct the squeezed output tensor - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = common::vectorize(out_dims); - for (size_t i = 0; i < dims_ref.size(); ++i) { - dims_vector[dims_ref[i]] = kDelFlag; - } - dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = common::make_ddim(dims_vector); - } - auto& place = *context.eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } -} - -template -void ReduceGradFunctor(const DeviceContext& context, - const phi::DenseTensor& input0, - const phi::DenseTensor& input1, - const phi::DenseTensor& input2, - phi::DenseTensor* output, - Functor functor, - const std::vector& dims) { - auto x = EigenTensor::From(input0); - auto x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto x_dims = input0.dims(); - auto reduced_dims_v = common::vectorize(x_dims); - std::vector dims_ref = 
dims; - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - - int broad_cats_times = 1; - for (size_t i = 0; i < dims_ref.size(); ++i) { - if (dims_ref[i] < 0) { - dims_ref[i] = x_rank + dims_ref[i]; - } - reduced_dims_v[dims_ref[i]] = 1; - broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; - broad_cats_times *= x_dims[dims_ref[i]]; - } - auto reduced_dims = common::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); - - auto& place = *context.eigen_device(); - - functor(place, - &x, - &x_reduce, - &x_grad, - &x_reduce_grad, - broadcast_dim, - broad_cats_times); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc deleted file mode 100644 index 1d11c62b56956..0000000000000 --- a/paddle/fluid/operators/rrelu_op.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle::operators { - -class RReluOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class RReluOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input of RReLU op."); - AddOutput("Out", "The output of RReLU op."); - AddOutput("Noise", "The random sampled RReLU noise.") - .AsIntermediate() - .AsExtra(); - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - float default_lower = 1. / 8.; - AddAttr("lower", "Lower bound of the uniform distribution.") - .SetDefault(default_lower) - .AddCustomChecker([](const float& lower) { - PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f, - true, - phi::errors::InvalidArgument( - "'RRelu_lower' must be between 0.0 and 1.0.")); - }); - float defalut_upper = 1. / 3.; - AddAttr("upper", "Upper bound of the uniform distribution.") - .SetDefault(defalut_upper) - .AddCustomChecker([](const float& upper) { - PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f, - true, - phi::errors::InvalidArgument( - "'RRelu_upper' must be between 0.0 and 1.0.")); - }); - AddComment(R"DOC( -RReLU Operator. - -Applies the randomized leaky rectified liner unit function, element-wise, -as described in the paper: - -`Empirical Evaluation of Rectified Activations in Convolutional Network`_. - -The function is defined as: - -.. 
math:: - \text{RReLU}(x) = - \begin{cases} - x & \text{if } x \geq 0 \\ - ax & \text{ otherwise } - \end{cases} - -where :math:`a` is randomly sampled from uniform distribution -:math:`\mathcal{U}(\text{lower}, \text{upper})`. - - See: https://arxiv.org/pdf/1505.00853.pdf - -)DOC"); - } -}; - -class RReluGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class RReluGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("rrelu_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Noise", this->Output("Noise")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(rrelu, - RReluInferShapeFunctor, - PD_INFER_META(phi::RReluInferMeta)); - -REGISTER_OPERATOR(rrelu, - ops::RReluOp, - ops::RReluOpMaker, - ops::RReluGradOpMaker, - ops::RReluGradOpMaker, - RReluInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(rrelu_grad, - RReluGradInferShapeFunctor, - PD_INFER_META(phi::RReluGradInferMeta)); -REGISTER_OPERATOR(rrelu_grad, ops::RReluGradOp, RReluGradInferShapeFunctor); diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc deleted file mode 100644 index 6e3804fcb0a92..0000000000000 --- a/paddle/fluid/operators/tdm_child_op.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/operators/tdm_child_op.h" - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "X(Tensor), dtype support int32/int64, X variable is the " - "node id of TDM-Tree"); - AddInput( - "TreeInfo", - "TreeInfo(Tensor), dtype support int32/int64, it stores the node " - "information in the following format: item_id(shape=1), " - "layer_id(shape=1), parent_id(shape=1), child_id(shape=child_nums)"); - AddAttr("child_nums", - "child_nums(int)" - "The child nums of one node, if the node hasn't enough child, " - "it should padding 0 until child nums equal to child_nums"); - AddOutput("Child", - "Return the children's node_id of input node, " - "if input don't have child, return 0"); - AddOutput("LeafMask", - "LeafMask has the same shape with Child" - "If child is leaf node, LeafMask value = 1, else = 0"); - AddAttr("dtype", - "(int, default INT32) " - "Output data type.") - .SetDefault(2); - AddComment(R"DOC(" - **Tdm Child** - According to the input node_id on the given tree, return the corresponding child node_id and - whether child is a leaf node by LeafMask.")DOC"); - } -}; - -class TDMChildOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - phi::errors::InvalidArgument( - "Inputs(X) of TdmChild should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("TreeInfo"), - true, - phi::errors::InvalidArgument( - "Inputs(TreeInfo) of TdmChild should not be null.")); - - int child_nums = ctx->Attrs().Get("child_nums"); - PADDLE_ENFORCE_GT( - child_nums, - 0, - phi::errors::InvalidArgument( - "ValueError: The value of the 'child_nums' must greater than 0. " - "But received child_nums value = %d, ", - child_nums)); - - auto info_dims = ctx->GetInputDim("TreeInfo"); - auto input_dims = ctx->GetInputDim("X"); - - PADDLE_ENFORCE_EQ( - info_dims.size(), - 2, - phi::errors::InvalidArgument( - "ShapeError: The dimensions of the 'tree info' must be 2. " - "But received tree info's dimensions = %d, " - "tree info's shape = [%s].", - info_dims.size(), - info_dims)); - - auto output_dims = common::vectorize(input_dims); - output_dims.push_back(child_nums); - ctx->SetOutputDim("Child", common::make_ddim(output_dims)); - ctx->SetOutputDim("LeafMask", common::make_ddim(output_dims)); - - if (ctx->GetOutputsVarType("Child")[0] == - framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Child"); - ctx->ShareLoD("X", /*->*/ "LeafMask"); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - tdm_child, - ops::TDMChildOp, - ops::TDMChildOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h deleted file mode 100644 index b645566736a9d..0000000000000 --- a/paddle/fluid/operators/tdm_child_op.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/common/flags.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/mixed_vector.h" - -namespace paddle { -namespace operators {} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc deleted file mode 100644 index db2dd6b4ced37..0000000000000 --- a/paddle/fluid/operators/tdm_sampler_op.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -class TDMSamplerOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "X(Tensor), Input variable which" - "mapping the leaf node idx of tdm tree," - "dtype support int32/int64"); - AddInput("Travel", - "Travel(Tensor), must has the same dtype with Layer" - "Contains path information of all leaf nodes to root node," - " dtype support int32/64"); - AddInput("Layer", - "Layer(Tensor), must has the same dtype with Travel " - "Indicates which nodes are in each layer"); - AddAttr("output_positive", - "output_positive(bool)" - "Whether positive samples are included in the output") - .SetDefault(true); - AddAttr>( - "neg_samples_num_list", - "neg_samples_num_list(python:list[int], C++:vector)" - "The num of negative samples in each layer") - .SetDefault({}); - AddAttr>("layer_offset_lod", - "offset lod information of Layer") - .SetDefault({}); - AddAttr("seed", - "(int) The seed used in sampler. If it is 0, " - "the sampler will generate a seed randomly.") - .SetDefault(0); - AddAttr("dtype", - "(int, default INT32) " - "Output data type.") - .SetDefault(2); - AddOutput("Out", - "Sampling result lodTensor, with shape [batch_size, layer_num, " - "neg_num_of_layer]"); - AddOutput("Labels", - "Labels of sampling result, has the same shape with Out." 
- "pos samples mapping value 1, neg sample mapping value 0") - .AsDispensable(); - AddOutput( - "Mask", - "Padding flag of Sampling result, if sampling res comes from padding," - "it will be 0, else 1, lodTensor, with shape [batch_size, " - "layer_num, neg_num_of_layer]"); - AddComment(R"DOC(" - **TDM Sampler** - According to the input positive samples at leaf node, do negative sampling layer by layer on the given tree.")DOC"); - } -}; - -class TDMSamplerOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - phi::errors::InvalidArgument( - "Inputs(Input) of TdmSampler should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Travel"), - true, - phi::errors::InvalidArgument( - "Inputs(Travel) of TdmSampler should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Layer"), - true, - phi::errors::InvalidArgument( - "Inputs(Layer) of TdmSampler should not be null.")); - auto neg_samples_num_vec = - ctx->Attrs().Get>("neg_samples_num_list"); - auto output_positive_flag = ctx->Attrs().Get("output_positive"); - - int64_t sample_res_length = 0; - for (auto sample_nums : neg_samples_num_vec) { - sample_res_length += sample_nums + (int64_t)output_positive_flag; - } - - auto input_dims = ctx->GetInputDim("X"); - auto ddim = common::make_ddim({-1, sample_res_length}); - if (ctx->IsRuntime()) { - auto output_dims = common::vectorize(input_dims); - auto batch_size = output_dims[0]; - ctx->SetOutputDim("Out", - common::make_ddim({batch_size, sample_res_length})); - ctx->SetOutputDim("Labels", - common::make_ddim({batch_size, sample_res_length})); - ctx->SetOutputDim("Mask", - common::make_ddim({batch_size, sample_res_length})); - } else { - ctx->SetOutputDim("Out", ddim); - ctx->SetOutputDim("Labels", ddim); - ctx->SetOutputDim("Mask", ddim); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - tdm_sampler, - ops::TDMSamplerOp, - ops::TDMSamplerOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc deleted file mode 100644 index 19334ca2dad6a..0000000000000 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/transfer_layout_op.h" - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace framework { -class OpDesc; -class InferShapeContext; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -class TransferLayoutOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - // kernel's device type is decided by input tensor place - auto *in = ctx.InputVar("X"); - auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in); - // NOTE(zhiqiu): hot fix, allow empty tensor of kMKLDNN layout to run this - // op - if (in_tensor->layout() != DataLayout::ONEDNN) { - PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), - true, - phi::errors::PreconditionNotMet( - "The tensor of Input(X) is not initialized.")); - } - auto place = - in_tensor->IsInitialized() ? in_tensor->place() : phi::CPUPlace(); - phi::DataType dtype = in_tensor->IsInitialized() ? in_tensor->dtype() - : phi::DataType::FLOAT32; - return phi::KernelKey(phi::TransToProtoVarType(dtype), place); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string &var_name, - const phi::DenseTensor &tensor, - const phi::KernelKey &expected_kernel_type) const override { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } -}; - -class TransferLayoutInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - ctx->SyncTypeAndDataType("X", "Out"); - } -}; - -class TransferLayoutKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - auto *out = ctx.OutputVar("Out"); - auto &dev_ctx = ctx.device_context(); - auto src_layout = ctx.Attr("src_layout"); - auto dst_layout = ctx.Attr("dst_layout"); - auto input_name = ctx.InputName("X"); - TransferLayoutFunctor( - x, out, dev_ctx, src_layout, dst_layout, input_name)(); - } -}; - -class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(phi::DenseTensor) The input Tensor"); - AddOutput("Out", - "(phi::DenseTensor) The Output Tensor with desired layout"); - // NOTE(zhiqiu): in most case, the src_layout is not needed, the op can use - // the layout - // of input X. However, in some mkldnn kernel, the src layout computed by - // GetKernelTypeForVar is different with the layout of tensor X. 
- AddAttr("src_layout", - "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3, default " - "-1 means unspecified and use the tensor's layout.") - .SetDefault(-1); - AddAttr("dst_layout", - "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3"); - AddComment(R"DOC( - TransferLayout Operator)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(transfer_layout, - TransferLayoutInferShapeFunctor, - PD_INFER_META(phi::TransferLayoutInferMeta)); -REGISTER_OPERATOR( - transfer_layout, - ops::TransferLayoutOp, - ops::TransferLayoutOpProtoMaker, - ops::TransferLayoutInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - TransferLayoutInferShapeFunctor); - -REGISTER_OP_VERSION(transfer_layout) - .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "src_layout", - "(int, the layout of the input tensor", - -1)); diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h deleted file mode 100644 index 1b4ef2d1b5abb..0000000000000 --- a/paddle/fluid/operators/transfer_layout_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/framework/data_transform.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/device_context.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -using DataLayout = phi::DataLayout; - -class TransferLayoutFunctor { - public: - TransferLayoutFunctor(const framework::Variable *in, - framework::Variable *out, - const platform::DeviceContext &dev_ctx, - const int src_layout, - const int dst_layout, - std::string in_name) - : in_(in), - out_(out), - dev_ctx_(dev_ctx), - src_layout_(src_layout), - dst_layout_(dst_layout), - in_name_(in_name) {} - - void operator()() const { - auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_); - phi::DenseTensor out_tensor; - - auto out_layout = static_cast(dst_layout_); - out_tensor.set_layout(out_layout); - -#ifdef PADDLE_WITH_DNNL - // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in - // data_transfer.cc - auto in_layout = static_cast(src_layout_); - auto *tensor_out = out_->GetMutable(); - VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout(); - if (!in_tensor.IsInitialized() && in_layout == DataLayout::ONEDNN && - out_layout == DataLayout::kNHWC) { - tensor_out->Resize(in_tensor.dims()); - tensor_out->set_layout(out_layout); - phi::funcs::MatchShapeToLayout(tensor_out, in_layout, out_layout); - return; - } - if (in_layout == DataLayout::ONEDNN || out_layout == DataLayout::ONEDNN) { - PADDLE_ENFORCE_NE( - in_layout, - out_layout, - phi::errors::PreconditionNotMet( - "No layout transform needed between two oneDNN OPKernels.")); - - if (in_layout != DataLayout::ONEDNN && out_layout == DataLayout::ONEDNN) { - // Case1 - transform from Non-ONEDNN OPKernel to ONEDNN OPKernel - // Just set layout/format. No real transform occur - - auto out_format = phi::funcs::OneDNNFormatForSize( - in_tensor.dims().size(), phi::funcs::ToOneDNNFormat(in_layout)); - out_tensor.ShareDataWith(in_tensor); - // For NHWC data we need reshape of tensors as MKL-DNN - // is expecting NHWC dims description order - if (in_layout == DataLayout::kNHWC) { - VLOG(4) << "kNHWC"; - phi::funcs::MatchShapeToLayout(&out_tensor, in_layout, out_layout); - phi::OneDNNContext::tls().set_cur_paddle_data_layout(in_layout); - } - auto out_tz = out_tensor.dims().size() == 0 - ? 
std::vector{1} - : common::vectorize(out_tensor.dims()); - dnnl::memory::data_type in_type = - phi::funcs::ToOneDNNDataType(in_tensor.dtype()); - - dnnl::memory::desc out_mem_desc(out_tz, in_type, out_format); - out_tensor.set_mem_desc(out_mem_desc); - } else { - auto target_layout = - phi::OneDNNContext::tls().get_cur_paddle_data_layout(); - // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in - // fetch_op.cc - if (out_layout == DataLayout::kNCHW && - in_name_ == framework::GradVarName("Filter")) { - target_layout = out_layout; - } - VLOG(4) << "TransDataLayoutFromOneDNN: " << in_layout << "->" - << target_layout; - // Case2 - transform from ONEDNN OPKernel to Non-ONEDNN OPKernel - // Do transform via ONEDNN lib - phi::funcs::TransDataLayoutFromOneDNN(in_layout, - target_layout, - in_tensor, - &out_tensor, - dev_ctx_.GetPlace()); - } - } else { - // Case3 - transform between Non-ONEDNN OPKernels - TransDataLayout(dev_ctx_, in_tensor, &out_tensor); - } -#else - // Case3 - transform between Non-ONEDNN OPKernels - TransDataLayout(dev_ctx_, in_tensor, &out_tensor); -#endif - framework::SetTensorToVariable(*in_, out_tensor, out_); - } - - private: - void TransDataLayout(const platform::DeviceContext &dev_ctx, - const phi::DenseTensor &in, - phi::DenseTensor *out) const { - PADDLE_ENFORCE_EQ( - common::arity(in.dims()), - 4, - phi::errors::InvalidArgument( - "Input dimension arity only can be 4, the input dimension is %s.", - in.dims())); - - auto src_dim = in.dims(); - std::vector dst_dim; - - auto axis = framework::GetAxis(in.layout(), out->layout()); - dst_dim.resize(axis.size()); - for (size_t i = 0; i < axis.size(); i++) { - dst_dim[i] = src_dim[axis[i]]; - } - - out->Resize(common::make_ddim(dst_dim)); - out->mutable_data(in.place(), in.type()); - - framework::VisitDataType( - framework::TransToProtoVarType(in.dtype()), - framework::CastDataLayout(&dev_ctx, axis, in, out)); - } - - const framework::Variable *in_; - framework::Variable *out_; - const platform::DeviceContext &dev_ctx_; - const int src_layout_; - const int dst_layout_; - std::string in_name_; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index f9b7948de3329..6832b8f9fff2c 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -79,6 +79,9 @@ set(op_src_files_tmp set(op_vjp_src_file_tmp ${op_vjp_source_file_tmp}) +set(op_cc_split_num 4) +set(bwd_op_cc_split_num 2) + # Auto code gen execute_process( COMMAND ${PYTHON_EXECUTABLE} ${op_parse_file} --op_yaml_path @@ -95,15 +98,22 @@ execute_process( --op_compat_yaml_file ${op_compat_yaml_file} --namespaces ${op_namespace} --dialect_name ${dialect_name} --op_def_h_file ${op_header_file_tmp} --op_info_file ${op_info_file_tmp} --op_def_cc_file ${op_src_files_tmp} - --op_vjp_cc_file ${op_vjp_src_file_tmp} --with_distributed - ${WITH_DISTRIBUTE}) + --op_vjp_cc_file ${op_vjp_src_file_tmp} --op_cc_split_num + ${op_cc_split_num} --bwd_op_cc_split_num ${bwd_op_cc_split_num} + --with_distributed ${WITH_DISTRIBUTE}) + +set(split_op_source_files + ${PIR_DIALECT_BINARY_DIR}/pd_op1.cc ${PIR_DIALECT_BINARY_DIR}/pd_op2.cc + ${PIR_DIALECT_BINARY_DIR}/pd_op3.cc ${PIR_DIALECT_BINARY_DIR}/pd_op4.cc) +set(split_bwd_op_source_files ${PIR_DIALECT_BINARY_DIR}/pd_op_bwd1.cc + ${PIR_DIALECT_BINARY_DIR}/pd_op_bwd2.cc) set(generated_files_pd_op "${op_header_file}" "${op_info_file}" - "${op_source_file}" + "${split_op_source_files}" + 
"${split_bwd_op_source_files}" "${op_vjp_source_file}" - "${bwd_op_source_file}" "${fused_op_source_file}" "${bwd_fused_op_source_file}" "${pir_op_source_file}" @@ -247,8 +257,8 @@ set(op_dialect_srcs ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_attribute.cc ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_type.cc ${op_info_file} - ${op_source_file} - ${bwd_op_source_file} + ${split_op_source_files} + ${split_bwd_op_source_files} ${fused_op_source_file} ${bwd_fused_op_source_file} ${pir_op_source_file} diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc index 505b178a452b0..db7089e32177b 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc @@ -17,8 +17,7 @@ #include "paddle/common/enforce.h" #include "paddle/pir/include/core/operation.h" -namespace paddle { -namespace dialect { +namespace paddle::dialect { bool AllInputAreDist(const std::vector& inputs) { for (auto value : inputs) { @@ -159,10 +158,10 @@ pir::Attribute CreateReplicatedDistAttr(pir::Type prim_type, } return nullptr; } -pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) { - if (!prim_type) return nullptr; +pir::Type CvtToPirDistType(pir::Type global_type, pir::Attribute dist_attr) { + if (!global_type) return nullptr; auto ctx = pir::IrContext::Instance(); - if (auto dense_tensor_type = prim_type.dyn_cast()) { + if (auto dense_tensor_type = global_type.dyn_cast()) { auto tensor_dist_attr = dist_attr.dyn_cast(); if (!tensor_dist_attr) { VLOG(0) << "Convert dense tensor type to dist type with attribute {" @@ -172,7 +171,7 @@ pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) { "with non-empty TensorDistAttr")); } return DistDenseTensorType::get(ctx, dense_tensor_type, tensor_dist_attr); - } else if (auto vec_type = prim_type.dyn_cast()) { + } else if (auto vec_type = global_type.dyn_cast()) { auto array_attr = dist_attr.dyn_cast(); if (!array_attr) { VLOG(0) << "Convert vector type to dist type with attribute {" @@ -192,8 +191,8 @@ pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) { } return pir::VectorType::get(ctx, dist_vec_type); } else { - VLOG(0) << "Convert type{" << prim_type << "} to dist type with attribute {" - << dist_attr << "}."; + VLOG(0) << "Convert type{" << global_type + << "} to dist type with attribute {" << dist_attr << "}."; PADDLE_THROW(common::errors::InvalidArgument( "Currently only support convert dense_tensor_type r vector type to " "dist.")); @@ -225,5 +224,4 @@ void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr) { } } } -} // namespace dialect -} // namespace paddle +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h index a50331a8ea395..10f76a86e600d 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h @@ -37,7 +37,7 @@ pir::Attribute CvtToPirAttr(const phi::distributed::ArgDistAttr& dist_attr); pir::Attribute CreateReplicatedDistAttr(pir::Type prim_type, ProcessMeshAttribute mesh); -pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr); +pir::Type CvtToPirDistType(pir::Type global_type, pir::Attribute dist_attr); /// /// When the following conditions are met: diff --git a/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc 
b/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc index 4191eaa4bce50..5d1a9b87431f1 100644 --- a/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc +++ b/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc @@ -35,7 +35,11 @@ class FusedAllReduceSplitPattern : public paddle::drr::DrrPatternBase { const auto &c_allreduce_sum_ = pat.Op(paddle::dialect::CAllreduceSum_Op::name(), {{"ring_id", pat.Attr("ring_id")}, - {"use_calc_stream", pat.Attr("use_calc_stream")}}); + {"use_calc_stream", pat.Attr("use_calc_stream")}, + {"execution_stream", pat.Attr("execution_stream")}, + {"force_record_event", pat.Attr("force_record_event")}, + {"event_to_record", pat.Attr("event_to_record")}, + {"events_to_wait", pat.Attr("events_to_wait")}}); const auto &assign = pat.Op(paddle::dialect::AssignOp::name()); const auto &full = pat.Op(paddle::dialect::FullOp::name()); const auto &split_with_num = pat.Op(paddle::dialect::SplitWithNumOp::name(), @@ -74,7 +78,11 @@ class FusedAllReduceSplitPattern : public paddle::drr::DrrPatternBase { res.Op(paddle::dialect::CReducescatterOp::name(), {{"ring_id", pat.Attr("ring_id")}, {"nranks", pat.Attr("num")}, - {"use_calc_stream", pat.Attr("use_calc_stream")}}); + {"use_calc_stream", pat.Attr("use_calc_stream")}}, + {{"execution_stream", pat.Attr("execution_stream")}, + {"force_record_event", pat.Attr("force_record_event")}, + {"event_to_record", pat.Attr("event_to_record")}, + {"events_to_wait", pat.Attr("events_to_wait")}}); c_reducescatter({&res.Tensor("input_grad_partial")}, {&res.Tensor("out")}); } diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 36d3a26f680a0..ed4b1bae54650 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -14,6 +14,7 @@ import argparse import logging +import math import os import pathlib import sys @@ -1130,6 +1131,21 @@ def get_mutable_attribute_grad_semantic(op_info, op_info_items): return mutable_attribute_grad_semantics +def split_ops(op_info_items: dict, cc_file, split_nums): + op_list = list(op_info_items.keys()) + ops_max_size = math.ceil(len(op_list) / split_nums) + split_op_info_items = [] + for i in range(split_nums): + split_op_info_items.append({}) + for i, op_name in enumerate(op_list): + list_idx = math.ceil((i + 1) / ops_max_size) - 1 + split_op_info_items[list_idx][op_name] = op_info_items[op_name] + split_cc_files = [] + for i in range(split_nums): + split_cc_files.append(cc_file.replace(".cc", f"{i + 1}.cc")) + return split_op_info_items, split_cc_files + + def GenOneDnnExtraAttrsDefaultValue(onednn_extra_args): INTARRAY_STR_TEMPLATE = """ pir::Attribute attr_{attr_name} = {op_attribute_type}::get(pir::IrContext::Instance(), phi::IntArray({attr})); """ @@ -2080,6 +2096,8 @@ def OpGenerator( op_info_file, op_def_cc_file, op_vjp_cc_file, + op_cc_split_num, + bwd_op_cc_split_num, onednn_yaml_file, ops_onednn_extra_yaml_file, ): @@ -2126,9 +2144,11 @@ def OpGenerator( op_infos = [] all_op_info_items = {} + new_op_def_cc_file = [] first_file = True onednn_only_op_list = [] - for yaml_file in op_yaml_files: + for idx in range(len(op_yaml_files)): + yaml_file = op_yaml_files[idx] op_yaml_items = [] with open(yaml_file, "r") as f: ops = yaml.safe_load(f) @@ -2194,13 +2214,37 @@ def OpGenerator( key_suffix = '_sp' if item.is_sparse_op else '' op_info_items[op['name'] + key_suffix] 
= item all_op_info_items[op['name'] + key_suffix] = item - op_infos.append(op_info_items) + + if dialect_name != "onednn_op": + cc_file = op_def_cc_file[idx] + if ( + yaml_file.split('/')[-1] == "ops.parsed.yaml" + and op_cc_split_num is not None + ): + split_op_info_items, split_cc_files = split_ops( + op_info_items, cc_file, op_cc_split_num + ) + op_infos.extend(split_op_info_items) + new_op_def_cc_file.extend(split_cc_files) + elif ( + yaml_file.split('/')[-1] == "backward.parsed.yaml" + and bwd_op_cc_split_num is not None + ): + split_op_info_items, split_cc_files = split_ops( + op_info_items, cc_file, bwd_op_cc_split_num + ) + op_infos.extend(split_op_info_items) + new_op_def_cc_file.extend(split_cc_files) + else: + op_infos.append(op_info_items) + new_op_def_cc_file.append(cc_file) if first_file: first_file = False if dialect_name == "onednn_op": op_infos = [all_op_info_items] + new_op_def_cc_file = op_def_cc_file # (3) auto code gen op_list_strs = [] declare_type_id_strs = [] @@ -2329,7 +2373,7 @@ def OpGenerator( f.write(op_info_str) # (6) write to files for xx_op.cc.tmp - for id in range(len(op_def_cc_file)): + for id in range(len(new_op_def_cc_file)): source_file_str = source_file_strs[id] for name in reversed(namespaces): source_file_str = NAMESPACE_GARD_TEMPLATE.format( @@ -2349,7 +2393,7 @@ def OpGenerator( input=source_file_str, define_type_id=define_type_id_strs[id], ) - with open(op_def_cc_file[id], 'w') as f: + with open(new_op_def_cc_file[id], 'w') as f: f.write(source_file_str) # (6) write to files for xx_vjp_op.cc.tmp @@ -2381,6 +2425,8 @@ def ParseArguments(): parser.add_argument('--op_info_file', type=str) parser.add_argument('--op_def_cc_file', type=str) parser.add_argument('--op_vjp_cc_file', type=str) + parser.add_argument('--op_cc_split_num', type=int) + parser.add_argument('--bwd_op_cc_split_num', type=int) parser.add_argument('--onednn_yaml_file', type=str) parser.add_argument('--ops_onednn_extra_yaml_file', type=str) parser.add_argument('--with_distributed', type=strtobool) @@ -2403,6 +2449,8 @@ def ParseArguments(): op_info_file = args.op_info_file op_def_cc_files = args.op_def_cc_file.split(",") op_vjp_cc_file = args.op_vjp_cc_file + op_cc_split_num = args.op_cc_split_num + bwd_op_cc_split_num = args.bwd_op_cc_split_num onednn_yaml_file = args.onednn_yaml_file ops_onednn_extra_yaml_file = args.ops_onednn_extra_yaml_file @@ -2417,6 +2465,8 @@ def ParseArguments(): op_info_file, op_def_cc_files, op_vjp_cc_file, + op_cc_split_num, + bwd_op_cc_split_num, onednn_yaml_file, ops_onednn_extra_yaml_file, ) diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 090aab4e3c4ed..95f104b76da51 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -168,6 +168,7 @@ 'fused_elementwise_div', 'fused_elementwise_mul', 'fused_elementwise_sub', + 'fusion_group', 'fusion_seqpool_cvm_concat', 'nce', 'lars_momentum', diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 5c7f01606c2df..777868c691c74 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -274,7 +274,7 @@ bool ConcatOpInferSymbolicShape(pir::Operation *op, SetShapeOrDataForAxis(axis); } else { 
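// Fallback branch: SetSymbolForValueByStaticShape (renamed from
// SetStaticShapeForValue) seeds the concat result with symbols derived from
// its static shape; the loop below then refreshes the axis value from that
// symbolic shape.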
pir::Value res = op->result(0); - infer_context->SetStaticShapeForValue(res); + infer_context->SetSymbolForValueByStaticShape(res); // update axis value auto res_shape = infer_context->GetShapeOrDataForValue(res); for (size_t i = 0; i < rank; ++i) { diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc index 783e56a3c505e..53c71c3fa0122 100644 --- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc @@ -38,6 +38,45 @@ void RewriteByInfermeta(pir::Operation* op, common::DataLayout new_layout) { } } +template <> +std::vector RelevantInputsImpl( + pir::Operation* op) { + auto concrete_op = op->dyn_cast(); + return {concrete_op.x(), concrete_op.residual()}; +} + +template <> +std::vector RelevantOutputsImpl( + pir::Operation* op) { + auto concrete_op = op->dyn_cast(); + return {concrete_op.y(), concrete_op.residual_out()}; +} + +template <> +common::DataLayout PreferLayoutImpl(pir::Operation* op) { + // Note(bukejiyu): add_group_norm_silu only supports NHWC layout now. + return common::DataLayout::NHWC; +} + +template <> +void RewriteByLayoutImpl(pir::Operation* op, + common::DataLayout new_layout) { + op->set_attribute( + "data_format", + pir::StrAttribute::get(pir::IrContext::Instance(), + common::DataLayoutToString(new_layout))); + + std::vector new_outputs = AddGroupNormSiluOp::InferMeta( + op->operands_source(), const_cast(&op->attributes())); + for (size_t i = 0; i < new_outputs.size(); ++i) { + op->result(i).set_type(new_outputs[i]); + } + + for (auto value : RelevantOutputsImpl(op)) { + SetNewLayoutForValue(value, new_layout); + } +} + template <> common::DataLayout PreferLayoutImpl(pir::Operation* op) { auto data_format_attr = op->attribute("data_format"); @@ -48,11 +87,30 @@ common::DataLayout PreferLayoutImpl(pir::Operation* op) { data_format_attr)); } - // Note(lyk): We exhibit the layout transformation for conv2d + auto concrete_op = op->dyn_cast(); + if (auto in = concrete_op.input()) { + if (auto in_type = in.type()) { + if (in_type.isa()) { + if (auto tensor_type = in_type.dyn_cast()) { + if (tensor_type.dtype().isa()) { + return common::DataLayout::NHWC; + } + } + } + } + } + + return common::StringToDataLayout(data_format_attr.AsString()); +} + +template <> +std::vector RelevantInputsImpl(pir::Operation* op) { + // Note(lyk): We exhibit the layout transformation for filter of conv2d // due to issues with its infermeta and kernel not functioning // properly in NHWC layout. However, if the FLAGS_manually_trans_conv_filter // is enabled, the transfer_layout_pass can also operate correctly. 
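  // (Only the activation input is reported as layout-relevant below; the
  // filter keeps its original order unless FLAGS_manually_trans_conv_filter
  // routes it through the transfer_layout_pass, as noted above.)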
-  return common::StringToDataLayout(data_format_attr.AsString());
+  auto concrete_op = op->dyn_cast();
+  return {concrete_op.input()};
 }
 
 template <>
@@ -78,6 +136,14 @@ common::DataLayout PreferLayoutImpl(pir::Operation* op) {
   auto original_layout =
       common::StringToDataLayout(data_format_attr.AsString());
 
+  if (op->HasAttribute(kForceBackendAttr) &&
+      op->attributes()
+              .at(kForceBackendAttr)
+              .dyn_cast<pir::StrAttribute>()
+              .AsString() == "gpu") {
+    return common::DataLayout::NHWC;
+  }
+
   auto concrete_op = op->dyn_cast();
   if (auto in = concrete_op.input()) {
     if (auto in_type = in.type()) {
@@ -124,6 +190,31 @@ void RewriteByLayoutImpl(pir::Operation* op,
   RewriteByInfermeta(op, new_layout);
 }
 
+template <>
+bool CanBeModifiedImpl(pir::Operation* op) {
+  auto data_format_attr = op->attribute("data_format");
+  if (!data_format_attr) {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "op (%s) should have attribute `data_format`, but got %s",
+        op,
+        data_format_attr));
+  }
+  auto cur_layout = common::StringToDataLayout(data_format_attr.AsString());
+  auto prefer_layout = PreferLayoutImpl(op);
+  auto can_be_modified = cur_layout != prefer_layout;
+
+  for (auto value : RelevantOutputsImpl(op)) {
+    // TODO(lyk): if the value is used in another block, we cannot rewrite
+    // this op
+    for (auto it = value.use_begin(); it != value.use_end(); ++it) {
+      if (it->owner()->GetParent() != op->GetParent()) {
+        return false;
+      }
+    }
+  }
+
+  return can_be_modified;
+}
+
 template <>
 void RewriteByLayoutImpl(pir::Operation* op,
                          common::DataLayout new_layout) {
diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
index 05719bc1dfb2f..fe0f7b440772e 100644
--- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
+++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
@@ -105,9 +105,11 @@ bool CanBeModifiedImpl(pir::Operation* op) {
 class FusedConv2dAddActOp;
 OVERLOAD_PREFER_LAYOUT(FusedConv2dAddActOp);
 OVERLOAD_REWRITE_BY_LAYOUT(FusedConv2dAddActOp);
+OVERLOAD_CAN_BE_MODIFIED(FusedConv2dAddActOp);
 
 class Conv2dOp;
 OVERLOAD_PREFER_LAYOUT(Conv2dOp);
+OVERLOAD_RELEVANT_INPUTS(Conv2dOp);
 OVERLOAD_REWRITE_BY_LAYOUT(Conv2dOp);
 
 class GroupNormOp;
@@ -115,6 +117,12 @@ OVERLOAD_REWRITE_BY_LAYOUT(GroupNormOp);
 OVERLOAD_RELEVANT_INPUTS(GroupNormOp);
 OVERLOAD_RELEVANT_OUTPUTS(GroupNormOp);
 
+class AddGroupNormSiluOp;
+OVERLOAD_REWRITE_BY_LAYOUT(AddGroupNormSiluOp);
+OVERLOAD_PREFER_LAYOUT(AddGroupNormSiluOp);
+OVERLOAD_RELEVANT_INPUTS(AddGroupNormSiluOp);
+OVERLOAD_RELEVANT_OUTPUTS(AddGroupNormSiluOp);
+
 class ReshapeOp;
 OVERLOAD_RELEVANT_INPUTS(ReshapeOp);
 OVERLOAD_RELEVANT_OUTPUTS(ReshapeOp);
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
index 7fb835dd01c90..2d705364b970f 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
@@ -71,11 +71,11 @@ void set_parameter(const pir::Value& parameter, const std::string& name) {
   }
 }
 
-void updata_parameter(const pir::Value& parameter, const std::string& name) {
+void update_parameter(const pir::Value& parameter, const std::string& name) {
   pir::Parameter* param = ApiBuilder::Instance().GetParameter(name);
   PADDLE_ENFORCE_NOT_NULL(param,
                           phi::errors::InvalidArgument(
-                              "Parameter %s not exist, can not updata.", name));
+                              "Parameter %s does not exist, cannot update.",
+                              name));
   std::unique_ptr<pir::Parameter> param_new(
       new pir::Parameter(nullptr, 0,
parameter.type())); ApiBuilder::Instance().SetParameter(name, std::move(param_new)); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index 86d9b9a8245cc..7a89ae9eafaa8 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -36,7 +36,7 @@ pir::Value parameter(const std::string& name); void set_parameter(const pir::Value& parameter, const std::string& name); -void updata_parameter(const pir::Value& parameter, const std::string& name); +void update_parameter(const pir::Value& parameter, const std::string& name); void shadow_output(const pir::Value& persist_value, const std::string& name); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc index c1aa3d776b67e..7b15459837fd9 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc @@ -61,7 +61,7 @@ std::vector> StackGradOp::DecompVjp( auto stop_gradients_attr = op->attribute(kAttrStopGradients) .dyn_cast() .AsVector(); - for (size_t i = 0; i < stop_gradients[0].size(); ++i) { + for (size_t i = 0; i < stop_gradients_attr.size(); ++i) { stop_gradients[0].push_back( stop_gradients_attr[i].dyn_cast().data()); } @@ -144,24 +144,31 @@ std::vector> ConcatGradOp::DecompVjp( .dyn_cast() .data(); - VLOG(6) << "Decomp call concat_grad's backward composite rule prepare"; + VLOG(4) << "Decomp call concat_grad's backward composite rule prepare"; std::vector> stop_gradients(op->results().size()); - if (combine_op_obj_x->HasAttribute(kAttrStopGradients)) { - auto stop_gradients_attr = op->attribute(kAttrStopGradients) - .dyn_cast() - .AsVector(); - for (size_t i = 0; i < stop_gradients[0].size(); ++i) { - stop_gradients[0].push_back( - stop_gradients_attr[i].dyn_cast().data()); + auto splitop = op->results()[0].first_use().owner(); + + if (splitop->HasAttribute("current_bwd_op_stop_gradients")) { + auto stop_gradients_attr = + splitop->attribute("current_bwd_op_stop_gradients") + .dyn_cast() + .AsVector(); + for (size_t i = 0; i < stop_gradients_attr.size(); ++i) { + auto stop_gradients_attr_j = + stop_gradients_attr[i].dyn_cast().AsVector(); + for (size_t j = 0; j < stop_gradients_attr_j.size(); ++j) { + stop_gradients[0].push_back( + stop_gradients_attr_j[j].dyn_cast().data()); + } } - VLOG(4) << " stop_gradients is set "; + VLOG(4) << " op stop_gradients is set "; } else { std::vector x_grad_stop_gradient(combine_op_obj_x.inputs().size(), false); stop_gradients[0] = x_grad_stop_gradient; - VLOG(4) << " stop_gradients is not set "; + VLOG(4) << " op stop_gradients is not set "; } std::vector> tensor_res; @@ -179,6 +186,7 @@ std::vector> ConcatGradOp::DecompVjp( paddle::primitive::details::concat_grad( x, out_grad, axis, x_grad); + VLOG(4) << "Call Pir Decomposed backward op concat_grad end"; std::vector> res(tensor_res.size()); for (size_t i = 0; i < tensor_res.size(); ++i) { diff --git a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc index 8a843a8881734..4eb8190eaa111 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc @@ -16,8 +16,7 @@ #include "paddle/common/enforce.h" #include "paddle/common/errors.h" -namespace paddle { -namespace dialect { +namespace paddle::dialect { const phi::IntArray &IntArrayAttribute::data() const { 
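// data() simply surfaces the phi::IntArray that the attribute storage keeps
// as its key.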
return storage()->GetAsKey(); } @@ -130,8 +129,7 @@ DataLayoutAttribute DataLayoutAttribute::Parse( parser.ctx, StringToDataLayoutMap().at(datalayout_token_val)); } -} // namespace dialect -} // namespace paddle +} // namespace paddle::dialect IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IntArrayAttribute) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ScalarAttribute) diff --git a/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc index 1d93e27c59b0b..78cb8e6460769 100644 --- a/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc +++ b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc @@ -21,8 +21,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" -namespace paddle { -namespace dialect { +namespace paddle::dialect { std::shared_ptr ParameterConvertInterface::ParameterToVariable(pir::Parameter *parameter) { if (parameter->type().isa()) { @@ -79,7 +78,6 @@ std::unique_ptr ParameterConvertInterface::VariableToParameter( } } -} // namespace dialect -} // namespace paddle +} // namespace paddle::dialect IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ParameterConvertInterface) diff --git a/paddle/fluid/pir/drr/src/match_context_impl.h b/paddle/fluid/pir/drr/src/match_context_impl.h index a9acb5f6ed8df..ce6911fb36ecb 100644 --- a/paddle/fluid/pir/drr/src/match_context_impl.h +++ b/paddle/fluid/pir/drr/src/match_context_impl.h @@ -17,6 +17,7 @@ #include #include +#include "glog/logging.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/drr/include/drr_pattern_context.h" #include "paddle/fluid/pir/drr/src/attr_type_uilts.h" @@ -100,27 +101,32 @@ class MatchContextImpl final { tensor_map_.emplace(value_name, value); } - void BindIrOperation(const OpCall* op_call, pir::Operation* op) { + bool BindIrOperation(const OpCall* op_call, pir::Operation* op) { operation_map_.emplace(op_call, op); const auto& attrs = op_call->attributes(); for (const auto& kv : attrs) { - std::visit( + bool bind_success = std::visit( [&](auto&& arg) { if constexpr (std::is_same_v, NormalAttribute>) { - PADDLE_ENFORCE( - op->HasAttribute(kv.first), - phi::errors::NotFound( - "Not found attribute [%s] in Op [%s], please check the " - "validity of the attribute name[%s].", - kv.first, - op->name(), - kv.first)); - BindIrAttr(arg.name(), op->attribute(kv.first)); + if (op->HasAttribute(kv.first)) { + BindIrAttr(arg.name(), op->attribute(kv.first)); + return true; + } } + return false; }, kv.second); + if (!bind_success) { + LOG(WARNING) << "Not found attribute [" << kv.first << "] in Op [" + << op->name() + << "], please check the " + "validity of the attribute name[" + << kv.first << "]."; + return false; + } } + return true; } private: diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 53b7ec0c919e9..93095af050afe 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -356,7 +356,10 @@ bool DrrRewritePattern::MatchFromOutputToInput( break; } // Step 1: Bind Operation of current op to match_ctx. - source_pattern_match_ctx->BindIrOperation(drr_node, ir_node); + if (!source_pattern_match_ctx->BindIrOperation(drr_node, ir_node)) { + matched = false; + break; + } // Step 2: Bind input_tensor of current op to match_ctx. 
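    // (A candidate match is rejected below as soon as a DRR input tensor's
    // recorded consumer count disagrees with the use_count of the matched
    // pir::Value.)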
    const auto& drr_input_tensors = drr_node->inputs();
@@ -391,7 +394,7 @@ bool DrrRewritePattern::MatchFromOutputToInput(
             ir_input_values[i].use_count()) {
           matched = false;
           VLOG(8) << drr_node->name()
                   << " Match failed: consumers of drr input["
-                  << i << "] { " << drr_node->outputs().size()
+                  << i << "] { " << drr_input_tensors[i]->consumers().size()
                   << " } != consumers of pir input[" << i << "] { "
                   << ir_input_values[i].use_count() << " }.";
           break;
@@ -495,8 +498,9 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
     }
 
     // set insert point
-    size_t max_input_op_index = 0UL;
-    pir::Operation* max_index_op = nullptr;
+    // 1. get result pattern max-idx of input op
+    size_t max_res_idx = 0UL;
+    pir::Operation* max_res_idx_op = nullptr;
     for (const Tensor* input : op_call.inputs()) {
       if (input->is_none()) {
         continue;
       }
@@ -506,18 +510,16 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
       pir::Operation* ir_input_op = ir_val.defining_op();
       if (op_2_temp_program_index.count(ir_input_op) == 0) {
         // do nothing
-      } else if (max_input_op_index <
-                 op_2_temp_program_index.at(ir_input_op)) {
-        max_input_op_index = op_2_temp_program_index.at(ir_input_op);
-        max_index_op = ir_input_op;
-      } else if (max_input_op_index ==
-                 op_2_temp_program_index.at(ir_input_op)) {
-        const auto& ops_vec = temp_program[max_input_op_index];
+      } else if (max_res_idx < op_2_temp_program_index.at(ir_input_op)) {
+        max_res_idx = op_2_temp_program_index.at(ir_input_op);
+        max_res_idx_op = ir_input_op;
+      } else if (max_res_idx == op_2_temp_program_index.at(ir_input_op)) {
+        const auto& ops_vec = temp_program[max_res_idx];
         for (auto it = ops_vec.begin(); it != ops_vec.end(); it++) {
-          if (*it == max_index_op) {
+          if (*it == max_res_idx_op) {
             break;
           } else if (*it == ir_input_op) {
-            max_index_op = ir_input_op;
+            max_res_idx_op = ir_input_op;
             break;
           } else {
             // do nothing
@@ -528,25 +530,29 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
         }
       }
     }
-    if (max_input_op_index == 0UL) {
-      VLOG(6) << "Not found producer op for (" << op_call.name() << ")";
-      pir::Operation* source_pattern_first_op = src_match_ctx.IrOperation(
-          source_pattern_graph.owned_op_call()[0].get());
-      max_input_op_index = op_2_temp_program_index[source_pattern_first_op];
-      rewriter.set_insertion_point(source_pattern_first_op);
-    } else {
-      rewriter.SetInsertionPointAfter(max_index_op);
-    }
-    pir::Operation* new_op =
-        CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx);
+    // 2. get source pattern min-idx op
+    pir::Operation* min_src_idx_op = src_match_ctx.IrOperation(
+        source_pattern_graph.owned_op_call()[0].get());
+    size_t min_src_idx = op_2_temp_program_index[min_src_idx_op];
+    for (const auto& src_owned_op_call :
+         source_pattern_graph.owned_op_call()) {
+      pir::Operation* src_owned_op =
+          src_match_ctx.IrOperation(src_owned_op_call.get());
+      size_t src_owned_op_idx = op_2_temp_program_index[src_owned_op];
+      if (min_src_idx > src_owned_op_idx) {
+        min_src_idx = src_owned_op_idx;
+        min_src_idx_op = src_owned_op;
+      }
+    }
 
-    size_t new_max_input_op_index = max_input_op_index + 1;
-    op_2_temp_program_index[new_op] = new_max_input_op_index;
-    if (new_max_input_op_index >= temp_program.size()) {
-      temp_program.emplace_back();
+    // 3. insert the new op at point max(max_res_idx + 1, min_src_idx)
+    if (min_src_idx > max_res_idx) {
+      rewriter.set_insertion_point(min_src_idx_op);
+    } else {
+      rewriter.SetInsertionPointAfter(max_res_idx_op);
     }
-    temp_program[new_max_input_op_index].push_back(new_op);
+
+    CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx);
   });
 
   return res_match_ctx;
diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
index fcbfcbb910e1e..61113f8e9dfc5 100644
--- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
+++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
@@ -278,7 +278,7 @@ struct FlowGraph {
       }
     }
 
-    std::unordered_set<Node> nhwc_nodes;
+    std::unordered_set<Node> mutable_nodes;
     for (auto& op : *(program.block())) {
       auto layout_transform_iface =
           op.dyn_cast<paddle::dialect::LayoutTransformationInterface>();
@@ -286,10 +286,14 @@ struct FlowGraph {
         continue;
       }
 
+      if (!layout_transform_iface.CanBeModified(&op)) {
+        continue;
+      }
+
       auto prefer_layout = layout_transform_iface.PreferLayout(&op);
       if (prefer_layout == common::DataLayout::NHWC) {
         Node op_node(&op);
-        nhwc_nodes.insert(op_node);
+        mutable_nodes.insert(op_node);
         AddEdge(op_node, dst_node(), INF);
         VLOG(10) << "[PreProcess] node: " << op_node
                  << " should be set to NHWC";
@@ -302,7 +306,7 @@ struct FlowGraph {
     // operations that have a determined layout and spread their layout to
     // their outputs and inputs recursively.
     std::queue<Node> q;
-    for (auto& n : nhwc_nodes) {
+    for (auto& n : mutable_nodes) {
       q.push(n);
     }
     std::unordered_set<Node> is_node_layout_visited;
@@ -362,13 +366,14 @@ struct FlowGraph {
               // a point of cut edge. So we set its outputs and inputs to
               // immutable.
               Node in_node = Node(v.defining_op());
-              nhwc_nodes.erase(in_node);
-              VLOG(10) << "erase node: " << in_node << " from nhwc set";
+              mutable_nodes.erase(in_node);
+              VLOG(10) << "erase node: " << in_node << " from mutable set";
               for (auto it = v.use_begin(); it != v.use_end(); ++it) {
                 Node out_node(it->owner());
-                nhwc_nodes.erase(out_node);
-                VLOG(10) << "erase node: " << out_node << " from nhwc set";
+                mutable_nodes.erase(out_node);
+                VLOG(10)
+                    << "erase node: " << out_node << " from mutable set";
               }
             }
             return !can_be_transformed;
@@ -380,8 +385,8 @@ struct FlowGraph {
         continue;
       }
 
-      VLOG(10) << "add node to nhwc set: " << node;
-      nhwc_nodes.insert(node);
+      VLOG(10) << "add node to mutable set: " << node;
+      mutable_nodes.insert(node);
 
       VLOG(10) << "processing node successor: " << node;
 
@@ -403,7 +408,7 @@ struct FlowGraph {
         continue;
       }
       is_node_layout_visited.insert(node);
-      if (nhwc_nodes.count(node) == 0) {
+      if (mutable_nodes.count(node) == 0) {
         VLOG(10) << "add node to nchw set: " << node;
         AddEdge(src_node(), node, INF);
       }
@@ -542,7 +547,7 @@ using Edge = FlowGraph::Edge;
 class TransferLayoutPass : public pir::Pass {
  public:
-  TransferLayoutPass() : pir::Pass("transfer_layout_pass", 3) {}
+  TransferLayoutPass() : pir::Pass("transfer_layout_pass", 2) {}
 
   bool CanApplyOn(pir::Operation* op) const override {
     if (!op->isa()) {
diff --git a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
index 35afabe3ad1dc..f8675afec6c57 100644
--- a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
@@ -141,11 +141,13 @@ class RmsNormFusePattern : public paddle::drr::DrrPatternBase {
 class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
  private:
   const bool extra_add_;
+  const bool trans_extra_add_;
 
  public:
-  explicit 
AddRmsNormFusePattern(bool extra_add) : extra_add_(extra_add) {} + AddRmsNormFusePattern(bool extra_add, bool trans_extra_add) + : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {} - uint32_t benefit() const override { return extra_add_ ? 2 : 1; } + uint32_t benefit() const override { return extra_add_ ? 4 : 3; } std::string name() const override { return "AddRmsNormFusePattern"; } @@ -176,7 +178,9 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase { if (extra_add_) { const auto &add1 = pat.Op(paddle::dialect::AddOp::name()); pat.Tensor("add_out1") = - add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); + trans_extra_add_ + ? add1(pat.Tensor("any_tensor"), pat.Tensor("add_out")) + : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); } paddle::drr::ResultPattern res = pat.ResultPattern(); const auto &res_rms_norm = @@ -207,11 +211,13 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase { class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { private: const bool extra_add_; + const bool trans_extra_add_; public: - explicit AddLayerNormFusePattern(bool extra_add) : extra_add_(extra_add) {} + AddLayerNormFusePattern(bool extra_add, bool trans_extra_add) + : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {} - uint32_t benefit() const override { return extra_add_ ? 2 : 1; } + uint32_t benefit() const override { return extra_add_ ? 4 : 3; } std::string name() const override { return "AddLayerNormFusePattern"; } void operator()(paddle::drr::DrrPatternContext *ctx) const override { @@ -231,22 +237,20 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { if (extra_add_) { const auto &add1 = pat.Op(paddle::dialect::AddOp::name()); pat.Tensor("add_out1") = - add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); + trans_extra_add_ + ? 
add1(pat.Tensor("any_tensor"), pat.Tensor("add_out")) + : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); } paddle::drr::ResultPattern res = pat.ResultPattern(); const auto &cast_op_dtype = res.ComputeAttr( [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType { - auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); - return paddle::dialect::TransToPhiDataType(x_dtype); + return phi::DataType::FLOAT32; }); - const auto &cast_op_1 = + const auto cast_1_op = res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); - res.Tensor("casted_bias") = cast_op_1(res.Tensor("bias")); - const auto &cast_op_2 = + const auto cast_2_op = res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); - res.Tensor("casted_w") = cast_op_2(res.Tensor("w")); - const auto &fuse_layer_norm = res.Op(paddle::dialect::FusedBiasResidualLayernormOp::name(), {{"epsilon", pat.Attr("epsilon")}, @@ -256,14 +260,15 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { {"quant_round_type", res.Int32Attr(0)}, {"quant_max_bound", res.Float32Attr(0.0)}, {"quant_min_bound", res.Float32Attr(0.0)}}); - + res.Tensor("w_cast") = cast_1_op(res.Tensor("w")); + res.Tensor("bias_cast") = cast_1_op(res.Tensor("bias")); fuse_layer_norm( { &res.Tensor("x"), - &res.Tensor("casted_bias"), - &res.Tensor("residual"), - &res.Tensor("casted_w"), &res.InputNoneTensor(), + &res.Tensor("residual"), + &res.Tensor("w_cast"), + &res.Tensor("bias_cast"), }, {&res.Tensor("layer_norm_out"), &res.Tensor("add_out"), @@ -272,6 +277,163 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { } }; +class AddGroupNormFusePattern : public paddle::drr::DrrPatternBase { + private: + const bool extra_add_; + const bool trans_extra_add_; + + public: + AddGroupNormFusePattern(bool extra_add, bool trans_extra_add) + : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {} + + uint32_t benefit() const override { return extra_add_ ? 4 : 3; } + std::string name() const override { return "AddGroupNormFusePattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &add = pat.Op(paddle::dialect::AddOp::name()); + const auto &group_norm = pat.Op(paddle::dialect::GroupNormOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}); + pat.Tensor("add_out") = add(pat.Tensor("x"), pat.Tensor("residual")); + group_norm( + {&pat.Tensor("add_out"), &pat.Tensor("scale"), &pat.Tensor("bias")}, + {&pat.Tensor("group_out"), + &pat.Tensor("mean_out_0"), + &pat.Tensor("variance_out_0")}); + // TODO(bukejiyu) :DRR support matching placeholder op, + // the following needs to be deleted + if (extra_add_) { + const auto &add1 = pat.Op(paddle::dialect::AddOp::name()); + pat.Tensor("add_out1") = + trans_extra_add_ + ? 
add1(pat.Tensor("any_tensor"), pat.Tensor("add_out")) + : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); + } + pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + return true; + }); + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &add_group_norm_silu_op = + res.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", res.StrAttr("")}}); + + add_group_norm_silu_op({&res.Tensor("x"), + &res.Tensor("residual"), + &res.Tensor("scale"), + &res.Tensor("bias")}, + {&res.Tensor("group_out"), + &res.Tensor("add_out"), + &res.Tensor("mean_out"), + &res.Tensor("variance_out")}); + } +}; + +class AddGroupNormWithActPattern : public paddle::drr::DrrPatternBase { + public: + uint32_t benefit() const override { return 2; } + std::string name() const override { return "AddGroupNormWithActPattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &add_group_norm_silu_op = + pat.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", pat.Attr("activation")}}); + const auto &silu = pat.Op(paddle::dialect::SiluOp::name()); + add_group_norm_silu_op({&pat.Tensor("x"), + &pat.Tensor("residual"), + &pat.Tensor("scale"), + &pat.Tensor("bias")}, + {&pat.Tensor("group_out"), + &pat.Tensor("add_out"), + &pat.Tensor("mean_out_0"), + &pat.Tensor("variance_out_0")}); + pat.Tensor("silu_out") = silu(pat.Tensor("group_out")); + pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + auto activation = match_ctx.Attr("activation"); + if (activation != "") { + return false; + } + return true; + }); + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &res_add_group_norm_silu_op = + res.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", res.StrAttr("silu")}}); + res_add_group_norm_silu_op({&res.Tensor("x"), + &res.Tensor("residual"), + &res.Tensor("scale"), + &res.Tensor("bias")}, + {&res.Tensor("silu_out"), + &res.Tensor("add_out"), + &res.Tensor("mean_out"), + &res.Tensor("variance_out")}); + } +}; + +class GroupNormWithActPattern : public paddle::drr::DrrPatternBase { + public: + uint32_t benefit() const override { return 1; } + std::string name() const override { return "GroupNormWithActPattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &group_norm = pat.Op(paddle::dialect::GroupNormOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}); + const auto &silu = pat.Op(paddle::dialect::SiluOp::name()); + group_norm({&pat.Tensor("x"), &pat.Tensor("scale"), &pat.Tensor("bias")}, + {&pat.Tensor("group_out"), + &pat.Tensor("mean_out_0"), + &pat.Tensor("variance_out_0")}); + pat.Tensor("silu_out") = silu(pat.Tensor("group_out")); + 
pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + return true; + }); + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &add_group_norm_silu_op = + res.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", res.StrAttr("silu")}}); + add_group_norm_silu_op({&res.Tensor("x"), + &res.InputNoneTensor(), + &res.Tensor("scale"), + &res.Tensor("bias")}, + {&res.Tensor("silu_out"), + &res.OutputNoneTensor(), + &res.Tensor("mean_out"), + &res.Tensor("variance_out")}); + } +}; + class AddNormFusePass : public pir::PatternRewritePass { public: AddNormFusePass() : pir::PatternRewritePass("add_norm_fuse_pass", 2) {} @@ -290,13 +452,37 @@ class AddNormFusePass : public pir::PatternRewritePass { // x-------- // add-rms_norm ---> rms_norm // residual- - ps.Add(paddle::drr::Create(context, !extra_add)); - ps.Add(paddle::drr::Create(context, extra_add)); + ps.Add( + paddle::drr::Create(context, !extra_add, false)); + ps.Add( + paddle::drr::Create(context, extra_add, true)); + ps.Add( + paddle::drr::Create(context, extra_add, false)); + // x-------- // add-layer_norm ----> fused_bias_residual_layernorm // residual- - ps.Add(paddle::drr::Create(context, !extra_add)); - ps.Add(paddle::drr::Create(context, extra_add)); + ps.Add(paddle::drr::Create( + context, !extra_add, false)); + ps.Add( + paddle::drr::Create(context, extra_add, true)); + ps.Add(paddle::drr::Create( + context, extra_add, false)); + + // x-------- + // add-group_norm ----> add_group_norm_silu + // residual- + ps.Add(paddle::drr::Create( + context, !extra_add, true)); + ps.Add( + paddle::drr::Create(context, extra_add, true)); + ps.Add(paddle::drr::Create( + context, extra_add, false)); + + // add_group_norm_silu-silu --->add_group_norm_silu + ps.Add(paddle::drr::Create(context)); + // group-silu->add_group_norm_silu + ps.Add(paddle::drr::Create(context)); return ps; } }; diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc index 96851cfeac559..754422312e47a 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc @@ -35,8 +35,8 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { private: std::string act_name_; bool cutlass_pattern_; - const std::unordered_set conv2d_depthwise_act_set_ = { - "relu", "swish", "sigmoid"}; + const std::unordered_set conv2d_depthwise_act_set_ = {"relu", + "swish"}; public: static const int CUTLASS_NHWC_ALIGNMENT = 8; @@ -152,62 +152,6 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { [this](const paddle::drr::MatchContext &match_ctx) -> std::string { return cutlass_pattern_ ? 
"gpu" : "gpudnn"; }); - const auto &perm_weight_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ || data_format == "NHWC") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_input_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_bias_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - auto bias_shape = pir::GetShapeFromValue(match_ctx.Tensor("bias")); - if (cutlass_pattern_ && data_format == "NCHW") { - if (bias_shape.size() == 4) { - return {0, 2, 3, 1}; - } else if (bias_shape.size() == 3) { - return {0, 2, 1}; - } else { - return {0}; - } - } else { - std::vector dst_vector(bias_shape.size()); - std::iota(dst_vector.begin(), dst_vector.end(), 0); - return dst_vector; - } - }); - const auto &data_format_conv = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::string { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return "NHWC"; - } else { - return data_format; - } - }); - // TODO(bukejiyu) When the transfer_layout_pass is supported, - // transpose_op will be deleted. - const auto &transpose_op_w = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_weight_shape}}); - const auto &transpose_op_input = res.Op( - paddle::dialect::TransposeOp::name(), {{"perm", perm_input_shape}}); - const auto &transpose_op_bias = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_bias_shape}}); - res.Tensor("filter_transpose") = transpose_op_w(res.Tensor("filter")); - res.Tensor("input_transpose") = transpose_op_input(res.Tensor("input")); - res.Tensor("bias_transpose") = transpose_op_bias(res.Tensor("bias")); const auto &fused_conv2d_add_act = res.Op( paddle::dialect::FusedConv2dAddActOp::name(), {{ @@ -216,7 +160,7 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { {"padding_algorithm", pat.Attr("padding_algorithm")}, {"dilations", pat.Attr("dilations")}, {"groups", pat.Attr("groups")}, - {"data_format", data_format_conv}, + {"data_format", pat.Attr("data_format")}, {"activation", res.StrAttr(act_name_)}, {"split_channels", res.VectorInt32Attr({})}, {"exhaustive_search", res.BoolAttr(false)}, @@ -224,24 +168,11 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { {"fuse_alpha", res.Float32Attr(0.0f)}, }}, {{{paddle::dialect::kForceBackendAttr, force_backend_runtime_attr}}}); - fused_conv2d_add_act({&res.Tensor("input_transpose"), - &res.Tensor("filter_transpose"), - &res.Tensor("bias_transpose"), + fused_conv2d_add_act({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), &res.InputNoneTensor()}, - {&res.Tensor("fuesd_conv2d_add_act_out")}); - const auto &perm_out_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 3, 1, 2}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &transpose_op_out = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_out_shape}}); - 
res.Tensor("act_out") = - transpose_op_out(res.Tensor("fuesd_conv2d_add_act_out")); + {&res.Tensor("act_out")}); } }; @@ -278,11 +209,9 @@ class Conv2dAdd2ActFusePattern if (next_op->isa()) { act_name = "relu"; } -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 8000 && CUDNN_VERSION < 8700 +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8700 if (next_op->isa()) { act_name = "tanh"; - } else if (next_op->isa()) { - act_name = "sigmoid"; } #endif if (act_name == "") { @@ -346,11 +275,10 @@ class Conv2dAddActFusePass : public pir::PatternRewritePass { paddle::dialect::FusedConv2dAddActOp::name()}); // NOTE(liuyuanle): cudnn [8.7, 8.9 now) version has bug when act is -// sigmoid/tanh. Ref to issue +// tanh. Ref to issue // https://github.com/PaddlePaddle/Paddle/issues/50853 #if CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8700 - const std::unordered_set cudnn_act_set( - {"relu", "sigmoid", "tanh"}); + const std::unordered_set cudnn_act_set({"relu", "tanh"}); #else const std::unordered_set cudnn_act_set({"relu"}); #endif diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc index 994fbdf2ce69f..89a023197a27e 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc @@ -138,62 +138,6 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { [this](const paddle::drr::MatchContext &match_ctx) -> std::string { return cutlass_pattern_ ? "gpu" : "gpudnn"; }); - const auto &perm_weight_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ || data_format == "NHWC") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_input_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_bias_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - auto bias_shape = pir::GetShapeFromValue(match_ctx.Tensor("bias")); - if (cutlass_pattern_ && data_format == "NCHW") { - if (bias_shape.size() == 4) { - return {0, 2, 3, 1}; - } else if (bias_shape.size() == 3) { - return {0, 2, 1}; - } else { - return {0}; - } - } else { - std::vector dst_vector(bias_shape.size()); - std::iota(dst_vector.begin(), dst_vector.end(), 0); - return dst_vector; - } - }); - const auto &data_format_conv = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::string { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return "NHWC"; - } else { - return data_format; - } - }); - // TODO(bukejiyu) When the transfer_layout_pass is supported, - // transpose_op will be deleted. 
- const auto &transpose_op_w = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_weight_shape}}); - const auto &transpose_op_input = res.Op( - paddle::dialect::TransposeOp::name(), {{"perm", perm_input_shape}}); - const auto &transpose_op_bias = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_bias_shape}}); - res.Tensor("filter_transpose") = transpose_op_w(res.Tensor("filter")); - res.Tensor("input_transpose") = transpose_op_input(res.Tensor("input")); - res.Tensor("bias_transpose") = transpose_op_bias(res.Tensor("bias")); const auto &fused_conv2d_add_act = res.Op( paddle::dialect::FusedConv2dAddActOp::name(), {{ @@ -202,7 +146,7 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { {"padding_algorithm", pat.Attr("padding_algorithm")}, {"dilations", pat.Attr("dilations")}, {"groups", pat.Attr("groups")}, - {"data_format", data_format_conv}, + {"data_format", pat.Attr("data_format")}, {"activation", res.StrAttr("identity")}, {"split_channels", res.VectorInt32Attr({})}, {"exhaustive_search", res.BoolAttr(false)}, @@ -211,25 +155,11 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { }}, {{{paddle::dialect::kForceBackendAttr, force_backend_runtime_attr}}}); - fused_conv2d_add_act( - {&res.Tensor("input_transpose"), - &res.Tensor("filter_transpose"), - &res.Tensor("bias_transpose"), - &res.InputNoneTensor()}, - {&res.Tensor("fuesd_conv2d_add_act_out"), &res.OutputNoneTensor()}); - const auto &perm_out_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> { - auto data_format = match_ctx.Attr<std::string>("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 3, 1, 2}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &transpose_op_out = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_out_shape}}); - res.Tensor("add_out") = - transpose_op_out(res.Tensor("fuesd_conv2d_add_act_out")); + fused_conv2d_add_act({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), + &res.InputNoneTensor()}, + {&res.Tensor("add_out"), &res.OutputNoneTensor()}); } }; diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 358d52d03d31b..d7b164862cd7e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -68,8 +68,7 @@ PADDLE_DEFINE_EXPORTED_uint64(cuda_memory_async_pool_realease_threshold, "Amount of reserved memory in bytes to hold onto " "before trying to release memory back to the OS"); -namespace paddle { -namespace platform { +namespace paddle::platform { void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; @@ -719,5 +718,4 @@ void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { phi::backends::gpu::GpuMemsetAsync(dst, value, count, stream); } -} // namespace platform -} // namespace paddle +} // namespace paddle::platform diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 496b253dff5b3..980b7cb35410b 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -80,8 +80,14 @@ namespace dynload { __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ __macro(cublasDgetriBatched); \ + __macro(cublasCgetrfBatched); \ + __macro(cublasCgetriBatched); \ + __macro(cublasZgetrfBatched); \ + __macro(cublasZgetriBatched); \ __macro(cublasSmatinvBatched); \ __macro(cublasDmatinvBatched); \ +
__macro(cublasCmatinvBatched); \ + __macro(cublasZmatinvBatched); \ __macro(cublasSgetrsBatched); \ __macro(cublasDgetrsBatched); diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 7b0ea3bb7f3c1..ee270918b59c7 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/nccl.h" -namespace paddle { -namespace platform { -namespace dynload { +namespace paddle::platform::dynload { #define DEFINE_WRAP(__name) DynLoad__##__name __name @@ -38,6 +36,4 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) #endif -} // namespace dynload -} // namespace platform -} // namespace paddle +} // namespace paddle::platform::dynload diff --git a/paddle/fluid/prim/api/api.yaml b/paddle/fluid/prim/api/api.yaml index a951ed4431a57..61e056678d19f 100644 --- a/paddle/fluid/prim/api/api.yaml +++ b/paddle/fluid/prim/api/api.yaml @@ -38,6 +38,7 @@ - pad - sqrt - cumsum +- cumprod - put_along_axis - sin - cos diff --git a/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 b/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 index 55b65bf05163f..b1b675a78589a 100644 --- a/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 +++ b/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 @@ -1,5 +1,5 @@ {% from "utils.cc.j2" import static_prim_api %} -// Generated by /paddle/fluid/prim/api/auto_code_generated/static_gen.py. +// Generated by /paddle/fluid/prim/api/auto_code_generated/static_gen.py. // DO NOT EDIT! #include diff --git a/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 b/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 index 78a270ef37d5b..5e34af02f2857 100644 --- a/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 +++ b/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 @@ -25,7 +25,7 @@ {% endfilter %} op->CheckAttrs(); op->InferVarType(block); - op->InferShape(*block); + op->InferShape(*block); {% if outputs|length > 1 %} return std::make_tuple{{sequence('(', ')', ', ', output_names)}}; {% elif outputs|length == 1 %} @@ -56,7 +56,7 @@ template <> {%- macro static_prim_api_sig_ret(outputs) -%} {%- set names = [] -%} {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type) -%} {%- endfor -%} - {%- if names|length > 1 -%} + {%- if names|length > 1 -%} std::tuple<{{sequence('', '', ', ', names)}}> {%- else -%} {{names[0]}} @@ -80,7 +80,7 @@ if ({{input.name}}) { std::transform({{input.name}}.get().begin(), {{input.name}}.get().end(), {{input.name}}_names.begin(), [](const Tensor& t) { return std::static_pointer_cast(t.impl())->Name(); }); - op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); + op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); } {%- else -%} if ({{input.name}}) { @@ -96,7 +96,7 @@ std::vector {{input.name}}_names({{input.name}}.size());; std::transform({{input.name}}.begin(), {{input.name}}.end(), {{input.name}}_names.begin(), [](const Tensor& t) { return std::static_pointer_cast(t.impl())->Name(); }); -op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); +op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); {%- else -%} op->SetInput("{{input.fluid_name | to_pascal}}", {std::static_pointer_cast({{input.name}}.impl())->Name()}); {%- endif -%} @@ 
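The cublas.h hunk above extends the dynload macro list with the complex-typed batched LU factorization, LU-based inversion, and direct inversion entry points, matching the existing S/D variants. For reference, the single-precision complex prototypes as documented by cuBLAS (quoted for context, not taken from this diff; the Z-prefixed variants take cuDoubleComplex):

#include <cublas_v2.h>
#include <cuComplex.h>

cublasStatus_t cublasCgetrfBatched(cublasHandle_t handle, int n,
                                   cuComplex* const Aarray[], int lda,
                                   int* PivotArray, int* infoArray,
                                   int batchSize);
cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n,
                                   const cuComplex* const Aarray[], int lda,
                                   const int* PivotArray,
                                   cuComplex* const Carray[], int ldc,
                                   int* infoArray, int batchSize);
cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n,
                                    const cuComplex* const A[], int lda,
                                    cuComplex* const Ainv[], int lda_inv,
                                    int* info, int batchSize);

Separately, the api.yaml hunk above registers cumprod as a prim API, which the prod_grad rewrite further below relies on.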
-180,7 +180,7 @@ paddle::framework::TransToProtoVarType({{src_name}}) {%- set is_set = [] -%} {#- why not use boolean, ref: https://stackoverflow.com/questions/17925674/jinja2-local-global-variable -#} {%- if not is_set -%} {#- use DataType attr as default output dtype -#} {%- for attr in attrs -%} - {%- if attr.typename is datatype -%} + {%- if attr.typename is datatype -%} {{attr.name}} {%- do is_set.append(1) -%} {%- endif -%} diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 0465f73a44593..17bc345917064 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -744,13 +744,20 @@ void slice_grad(const Tensor& input, paddings.push_back(offsets[i]); paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]); } + Tensor reshape_out_grad; + if (out_grad.shape().size() == 0) { + reshape_out_grad = full({1}, 1, input.dtype()); + } else { + reshape_out_grad = out_grad; + } + if (decrease_size > 0 && (decrease_size != static_cast(in_dims.size()))) { auto out_tmp = - pad(reshape(out_grad, origin_out_shape), paddings, 0.0); + pad(reshape(reshape_out_grad, origin_out_shape), paddings, 0.0); set_output(out_tmp, input_grad); } else { - auto out_tmp = pad(out_grad, paddings, 0.0); + auto out_tmp = pad(reshape_out_grad, paddings, 0.0); set_output(out_tmp, input_grad); } } @@ -1127,11 +1134,13 @@ void prod_grad(const Tensor& x, } else { reduce_all = false; } - auto x_grad_tmp = Tensor(); - auto out_tmp = Tensor(); + auto out_grad_tmp = Tensor(); + auto x_reshape = Tensor(); + std::vector unchange_axis, change_axis, transpose_shape, + cumprod_shape; + std::vector transpose_dim, origin_position; if (x_dim_size == 1) { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } else { if (!keep_dim) { auto axis_ = std::vector(); @@ -1149,16 +1158,69 @@ void prod_grad(const Tensor& x, } auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_); auto out_grad_ = reshape(out_grad, out_grad_shape); - x_grad_tmp = out_grad_.expand(IntArray(x_dim)); - auto out_ = reshape(out, out_grad_shape); - out_tmp = out_.expand(IntArray(x_dim)); + out_grad_tmp = out_grad_.expand(IntArray(x_dim)); } else { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } } - auto x_grad_res = x_grad_tmp * out_tmp * (1 / x); - set_output(x_grad_res, x_grad); + auto axis_ = std::vector(); + if (reduce_all) { + int64_t numel = 1; + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + numel *= x_dim[i]; + } + cumprod_shape.push_back(numel); + x_reshape = reshape(x, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_tmp2 = reshape(x_grad_tmp, x.shape()); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } else { + int64_t unchange_size = x_dim_size - axis_size; + int64_t unchange_index = 0; + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_.push_back(axis[i] + x_dim_size); + } else { + axis_.push_back(axis[i]); + } + } + for (int64_t i = 0; i < x_dim_size; i++) { + auto it = find(axis_.begin(), axis_.end(), i); + if (it != axis_.end()) { + int64_t index = it - 
axis_.begin(); + origin_position.push_back(static_cast(unchange_size + index)); + } else { + unchange_axis.push_back(i); + origin_position.push_back(static_cast(unchange_index)); + unchange_index += 1; + } + } + int64_t numel = 1; + for (int64_t i = 0; i < unchange_size; i++) { + transpose_shape.push_back(x_dim[unchange_axis[i]]); + cumprod_shape.push_back(x_dim[unchange_axis[i]]); + transpose_dim.push_back(static_cast(unchange_axis[i])); + } + for (int64_t i = 0; i < axis_size; i++) { + transpose_shape.push_back(x_dim[axis_[i]]); + transpose_dim.push_back(static_cast(axis_[i])); + numel *= x_dim[axis_[i]]; + } + cumprod_shape.push_back(numel); + auto x_transpose = transpose(x, transpose_dim); + x_reshape = reshape(x_transpose, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_reshape = reshape(x_grad_tmp, transpose_shape); + auto x_grad_tmp2 = transpose(x_grad_reshape, origin_position); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } } } diff --git a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc index 2f76e8bbd966f..43ab21ccd3e06 100644 --- a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc +++ b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc @@ -23,8 +23,7 @@ #include "paddle/fluid/prim/utils/static/static_global_utils.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/utils/data_type.h" -namespace paddle { -namespace prim { +namespace paddle::prim { using Tensor = paddle::Tensor; template <> TEST_API Tensor empty(const paddle::experimental::IntArray& shape, @@ -69,5 +68,4 @@ void by_pass(const paddle::Tensor& x, paddle::Tensor* real_out) { set_output(out, real_out); } -} // namespace prim -} // namespace paddle +} // namespace paddle::prim diff --git a/paddle/fluid/prim/utils/static/static_global_utils.cc b/paddle/fluid/prim/utils/static/static_global_utils.cc index 3d1aa2158048d..71179429dc997 100644 --- a/paddle/fluid/prim/utils/static/static_global_utils.cc +++ b/paddle/fluid/prim/utils/static/static_global_utils.cc @@ -14,12 +14,10 @@ #include "paddle/fluid/prim/utils/static/static_global_utils.h" -namespace paddle { -namespace prim { +namespace paddle::prim { StaticCompositeContext* StaticCompositeContext::static_composite_context_ = new StaticCompositeContext(); thread_local bool StaticCompositeContext::enable_bwd_prim_ = false; thread_local bool StaticCompositeContext::enable_fwd_prim_ = false; thread_local bool StaticCompositeContext::enable_eager_prim_ = false; -} // namespace prim -} // namespace paddle +} // namespace paddle::prim diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 index 7f9f4b5b8676f..b8910ff5b9d9a 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 @@ -12,7 +12,7 @@ namespace backend { {%- macro args(inputs, attrs) -%} {#- Arguments are variable pass into method -#} {{common.sequence('', '', ', ', inputs)}} - {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between + {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between nputs and attrs -#} 
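The prod_grad rewrite above (and its twin in paddle/fluid/primitive/rule/vjp/details.h later in this diff) replaces the old x_grad = out_grad * out * (1/x) formula, which emits inf/nan as soon as any input element is zero, with a pair of exclusive cumulative products: d prod(x)/d x_i = (prod_{j<i} x_j) * (prod_{j>i} x_j). A standalone 1-D sketch of that identity, outside the Paddle tensor API:

#include <cstdio>
#include <vector>

int main() {
  // left[i] / right[i] are exclusive products of everything before / after
  // x[i] -- the 1-D analogue of the forward and reversed exclusive cumprod
  // calls assembled above.
  std::vector<double> x = {2.0, 0.0, 3.0, 4.0};  // note the zero
  const int n = static_cast<int>(x.size());
  std::vector<double> left(n, 1.0), right(n, 1.0);
  for (int i = 1; i < n; ++i) left[i] = left[i - 1] * x[i - 1];
  for (int i = n - 2; i >= 0; --i) right[i] = right[i + 1] * x[i + 1];
  const double out_grad = 1.0;  // upstream gradient of the scalar prod(x)
  for (int i = 0; i < n; ++i)   // old formula would divide by x[1] == 0 here
    printf("dprod/dx[%d] = %g\n", i, left[i] * right[i] * out_grad);
}

The non-reduce_all branch reduces to this same 1-D rule by transposing the reduced axes to the end, flattening them into a single dimension, and undoing the transpose afterwards via origin_position.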
{{common.sequence('', '', ', ', attrs)}} {%- endmacro -%} @@ -37,7 +37,7 @@ return ::{{name}}_ad_func({{common.args(input_names, attr_names)}}); {% for api in apis %} {%- if api.is_prim and api.name not in backend_black_list and api.name[-1] != '_' -%} {{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} { -{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} +{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} } {% endif %} diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 index 26f81d756f0b5..8e4921acbb013 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 @@ -117,20 +117,20 @@ pir::Value {{attr.name}}_res = std::static_pointer_cast({{attr.name~ {% endif %} {% endfor %} {%- set input_names = [] -%} - {%- for i in inputs -%} - {%- do input_names.append(i.name~'_res') -%} + {%- for i in inputs -%} + {%- do input_names.append(i.name~'_res') -%} {%- endfor -%} {%- if mutable_attribute_as_inputs -%} - {%- for i in attrs -%} + {%- for i in attrs -%} {%- if i is mutable_attribute -%} - {%- do input_names.append(i.name~'_res') -%} + {%- do input_names.append(i.name~'_res') -%} {%- endif -%} {%- endfor -%} {%- endif -%} {%- set attr_names = [] -%} - {%- for i in attrs -%} + {%- for i in attrs -%} {%- if not mutable_attribute_as_inputs or mutable_attribute_as_inputs and i is not mutable_attribute -%}{#- do nothing -#} - {%- do attr_names.append(common.phi2ir_attr(i)) -%} + {%- do attr_names.append(common.phi2ir_attr(i)) -%} {%- endif -%} {% endfor %} auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}); @@ -145,14 +145,14 @@ auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}} {% set api_outputs = api.outputs | trip_intermediate %} {{sig(api.name, api.inputs, api_outputs, api.attrs)}} { {% filter indent(2, True) %} -{{body(api.name, api.inputs, api_outputs, api.attrs)}} +{{body(api.name, api.inputs, api_outputs, api.attrs)}} {% endfilter %} } {% if api.attrs is exist_mutable_attribute %} {{sig(api.name, api.inputs, api_outputs, api.attrs, True)}} { {% filter indent(2, True) %} -{{body(api.name, api.inputs, api_outputs, api.attrs, True)}} +{{body(api.name, api.inputs, api_outputs, api.attrs, True)}} {% endfilter %} } diff --git a/paddle/fluid/primitive/codegen/templates/common.j2 b/paddle/fluid/primitive/codegen/templates/common.j2 index b29401133db03..ecf5e54cae33b 100644 --- a/paddle/fluid/primitive/codegen/templates/common.j2 +++ b/paddle/fluid/primitive/codegen/templates/common.j2 @@ -8,12 +8,12 @@ template {%- set input_params = [] -%} {%- for i in inputs -%} {%- do input_params.append(i.typename|to_paddle_input_type(i.optional)~' '~i.name) -%} {%- endfor -%} {%- set attr_params = [] -%} - {%- for i in attrs -%} + {%- for i in attrs -%} {%- if not mutable_attribute_as_inputs or i is not mutable_attribute -%} {%- if default -%} - {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name~default_value(i)) -%} + {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name~default_value(i)) -%} {%- else -%} - {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name) -%} + {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name) -%} {%- endif -%} {%- 
else -%} {%- do input_params.append('const Tensor&'~' '~i.name~'_') -%} @@ -43,7 +43,7 @@ template {%- macro ret(outputs) -%} {%- set names = [] -%} {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type(i.optional)) -%} {%- endfor -%} - {%- if names|length > 1 -%} + {%- if names|length > 1 -%} std::tuple<{{sequence('', '', ', ', names)}}> {%- else -%} {{names[0]}} diff --git a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 index 460b8e3a2fcdc..592b45b84aa72 100644 --- a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 +++ b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 @@ -139,13 +139,13 @@ std::vector> {{class_name}}::DecompVjp(pir::Operation* o auto stop_gradients_attr = op->attribute(kAttrStopGradients) .dyn_cast() .AsVector(); - {% for k in range(outputs|length) %} + {% for k in range(outputs|length) %} stop_gradients[{{k}}].push_back( stop_gradients_attr[{{k}}].dyn_cast().data()); - {% endfor %} + {% endfor %} VLOG(4) << " stop_gradients is set "; } else { - {% for k in range(outputs|length) %} + {% for k in range(outputs|length) %} stop_gradients[{{k}}].push_back(false); {% endfor %} VLOG(4) << " stop_gradients is not set "; @@ -160,7 +160,7 @@ std::vector> {{class_name}}::DecompVjp(pir::Operation* o VLOG(4) << "Call Pir Decomposed backward op {{fwd_name}}"; - {% for k in range(outputs|length) %} + {% for k in range(outputs|length) %} paddle::Tensor* {{outputs[k].name}} = !stop_gradients[{{k}}][0] ? &tensor_res[{{k}}][0] : nullptr; {% endfor %} diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 index 105175758f22d..31ec42aacd7a9 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 @@ -79,11 +79,11 @@ auto {{i.name}} = phi::IntArray(paddle::dialect::GetInt64Vector({{i.name}}_defin {%- for api in apis -%} {%- do api_map.update({api.name: api}) -%} {%- endfor -%} {%- for i in api.inputs -%} {%- do input_names.append(i.name) -%} {%- endfor -%} {%- set attr_names=[] -%} - {%- for i in api.attrs -%} + {%- for i in api.attrs -%} {%- if i is mutable_attribute -%} - {%- do input_names.append(i.name~'_') -%} + {%- do input_names.append(i.name~'_') -%} {%- else -%} - {%- do attr_names.append(i.name) -%} + {%- do attr_names.append(i.name) -%} {%- endif -%} {%- endfor %} {% if 'invoke' in api and api.invoke.func in api_map %} @@ -116,7 +116,7 @@ FLAGS_tensor_operants_mode = "static"; VLOG(4) << "Call Pir Decomposed backward op {{api.name}}"; {% for i in range(api.outputs|length) %} {% if api.outputs[i].typename=='Tensor' %} -paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{{i}}][0] : nullptr; +paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? 
&vjp_res[{{i}}][0] : nullptr; {% else %} std::vector {{api.outputs[i].name}}(stop_gradients[{{i}}].size(), nullptr); for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) { diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 091d540aa461a..7b08f9f6571fd 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -23,9 +23,6 @@ namespace paddle { namespace primitive { namespace details { -// empty_shape means x.shape=[] -static std::vector empty_shape; - template static Tensor get_slice(const Tensor& x, int64_t idx) { return slice(x, {0}, {idx}, {idx + 1}, {1}, {}); @@ -98,7 +95,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { for (size_t i = 0; i < axis_.size(); i++) { value_ *= x_dim[axis_[i]]; } - value = full(empty_shape, value_, sum_x.dtype()); + value = full_scalar(value_, sum_x.dtype()); } Tensor res = sum_x / value; @@ -148,7 +145,7 @@ Tensor p_norm_decomp(const Tensor& x, Tensor res; if (porder == 0.0) { // 0-norm - auto zero = full(empty_shape, 0, x_tmp.dtype()); + auto zero = full_scalar(0, x_tmp.dtype()); auto none_zero = not_equal(x_tmp, zero); res = cast(none_zero, x_tmp.dtype()); res = sum(res, {axis}, x_tmp.dtype(), keepdim); @@ -169,8 +166,8 @@ Tensor p_norm_decomp(const Tensor& x, res = min(x_tmp, {axis}, keepdim); } else { // vanilla p-norm - auto porder_tensor = full(empty_shape, porder, x_tmp.dtype()); - auto inv_porder_tensor = full(empty_shape, 1 / porder, x_tmp.dtype()); + auto porder_tensor = full_scalar(porder, x_tmp.dtype()); + auto inv_porder_tensor = full_scalar(1 / porder, x_tmp.dtype()); res = elementwise_pow(x_tmp, porder_tensor); res = sum(res, {axis}, x_tmp.dtype(), keepdim); res = elementwise_pow(res, inv_porder_tensor); @@ -194,8 +191,7 @@ Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) { } check_valid_type(y.dtype()); - Tensor y_full = full(empty_shape, y, x_cast.dtype()); - + Tensor y_full = full_scalar(y, x_cast.dtype()); auto ans = elementwise_pow(x_cast, y_full); if (need_cast) { return cast(ans, org_dtype); @@ -282,13 +278,13 @@ Tensor squared_l2_norm_decomp(const Tensor& x) { template Tensor reciprocal_decomp(const Tensor& x) { - return full(empty_shape, 1.0, x.dtype()) / x; + return full_scalar(1.0, x.dtype()) / x; } template Tensor bce_loss_decomp(const Tensor& x, const Tensor& label) { - auto one = full(empty_shape, 1, x.dtype()); - auto ans = full(empty_shape, -1, x.dtype()) * + auto one = full_scalar(1, x.dtype()); + auto ans = full_scalar(-1, x.dtype()) * (label * log(x) + (one - label) * log(one - x)); return ans; } @@ -382,7 +378,7 @@ std::tuple batch_norm_decomp( } } - Tensor half = full(empty_shape, -0.5, x_cast.dtype()); + Tensor half = full_scalar(-0.5, x_cast.dtype()); bool use_run_stat = (is_test && (!trainable_statistics)) || use_global_stats; Tensor x_hat; @@ -421,9 +417,8 @@ std::tuple batch_norm_decomp( run_var_ = assign(run_var); } Tensor y; - Tensor new_scale = - scale ? scale.get() : full(empty_shape, 1, x_cast.dtype()); - Tensor new_bias = bias ? bias.get() : full(empty_shape, 0, x_cast.dtype()); + Tensor new_scale = scale ? scale.get() : full_scalar(1, x_cast.dtype()); + Tensor new_bias = bias ? 
bias.get() : full_scalar(0, x_cast.dtype()); if (data_layout_ == DataLayout::kNHWC) { y = x_hat * new_scale + new_bias; } else { @@ -539,13 +534,13 @@ Tensor swiglu_decomp(const Tensor& x, const paddle::optional& y) { template Tensor relu_decomp(const Tensor& x) { - return maximum(x, full(empty_shape, 0.0, x.dtype())); + return maximum(x, full_scalar(0.0, x.dtype())); } template Tensor relu6_decomp(const Tensor& x) { - auto tmp = maximum(x, full(empty_shape, 0.0, x.dtype())); - auto res = minimum(tmp, full(empty_shape, 6.0, x.dtype())); + auto tmp = maximum(x, full_scalar(0.0, x.dtype())); + auto res = minimum(tmp, full_scalar(6.0, x.dtype())); return res; } @@ -653,7 +648,7 @@ std::tuple layer_norm_decomp( auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; auto variance = mean_decomp(var_tmp1, axis, true); - auto var_tmp3 = variance + full(empty_shape, epsilon, variance.dtype()); + auto var_tmp3 = variance + full_scalar(epsilon, variance.dtype()); auto rsqrt_var = rsqrt(var_tmp3); auto out = difference * rsqrt_var; @@ -798,18 +793,18 @@ std::tuple dropout_decomp( Tensor uniform_tensor; if (has_dynamic_shape(x.shape())) { auto shape_tensor = shape(x); - auto zero = full(empty_shape, 0.0, dtype_tmp); - auto one = full(empty_shape, 1.0, dtype_tmp); + auto zero = full_scalar(0.0, dtype_tmp); + auto one = full_scalar(1.0, dtype_tmp); uniform_tensor = backend::uniform(shape_tensor, zero, one, dtype_tmp, seed_tmp); } else { uniform_tensor = uniform(phi::vectorize(x.dims()), dtype_tmp, 0.0, 1.0, seed_tmp); } - auto mask = cast( - greater_equal(uniform_tensor, full(empty_shape, p, dtype_tmp)), - org_dtype); - auto ones_p = full(empty_shape, 1.0 - p.to(), org_dtype); + auto mask = + cast(greater_equal(uniform_tensor, full_scalar(p, dtype_tmp)), + org_dtype); + auto ones_p = full_scalar(1.0 - p.to(), org_dtype); if (upscale_in_train) { if (is_test) { // inference: out = input @@ -818,7 +813,7 @@ std::tuple dropout_decomp( // train: out = input * mask / ( 1.0 - p ) if (p.to() == 1.0) { // Process p=1. 
for avoid divide zero error (x*mask/(1.0-p)) - auto zero = full(empty_shape, 0.0, org_dtype); + auto zero = full_scalar(0.0, org_dtype); return std::make_tuple(x * zero, cast(zero, DataType::UINT8)); } else { auto ans = (x * mask) / ones_p; @@ -842,20 +837,20 @@ Tensor gelu_decomp(const Tensor& x, bool approximate) { const double PM_SQRT1_2 = 0.70710678118654752440; /* 1/sqrt(2) */ auto org_dtype = x.dtype(); - auto half = full(empty_shape, 0.5, org_dtype); - auto one = full(empty_shape, 1.0, org_dtype); + auto half = full_scalar(0.5, org_dtype); + auto one = full_scalar(1.0, org_dtype); if (approximate) { // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - auto kAlpha = full(empty_shape, PM_2_SQRTPI * PM_SQRT1_2, org_dtype); - auto GELU_CONSTANT = full(empty_shape, 0.044715, org_dtype); - auto x_pow3 = elementwise_pow(x, full(empty_shape, 3, org_dtype)); + auto kAlpha = full_scalar(PM_2_SQRTPI * PM_SQRT1_2, org_dtype); + auto GELU_CONSTANT = full_scalar(0.044715, org_dtype); + auto x_pow3 = elementwise_pow(x, full_scalar(3, org_dtype)); auto tanh_out = tanh(kAlpha * (x + x_pow3 * GELU_CONSTANT)); auto res = x * half * (one + tanh_out); return res; } else { // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - auto M_SQRT1_2T = full(empty_shape, PM_SQRT1_2, org_dtype); + auto M_SQRT1_2T = full_scalar(PM_SQRT1_2, org_dtype); auto erf_out = one + erf(x * M_SQRT1_2T); auto res = x * half * erf_out; @@ -867,10 +862,10 @@ template Tensor hardsigmoid_decomp(const Tensor& x, float slope, float offset) { const double MAX_VALUE = 1.0; const double MIN_VALUE = 0.0; - return maximum(minimum(x * full(empty_shape, slope, x.dtype()) + - full(empty_shape, offset, x.dtype()), - full(empty_shape, MAX_VALUE, x.dtype())), - full(empty_shape, MIN_VALUE, x.dtype())); + return maximum(minimum(x * full_scalar(slope, x.dtype()) + + full_scalar(offset, x.dtype()), + full_scalar(MAX_VALUE, x.dtype())), + full_scalar(MIN_VALUE, x.dtype())); } template @@ -881,15 +876,15 @@ Tensor hardswish_decomp(const Tensor& x) { // out = minimum(maximum(x + offset, 0), threshold) * x / scale auto minimum_out = - minimum(maximum(x + full(empty_shape, OFFSET, x.dtype()), - full(empty_shape, 0.0, x.dtype())), - full(empty_shape, THRESHOLD, x.dtype())); - return (minimum_out * x) / full(empty_shape, SCALE, x.dtype()); + minimum(maximum(x + full_scalar(OFFSET, x.dtype()), + full_scalar(0.0, x.dtype())), + full_scalar(THRESHOLD, x.dtype())); + return (minimum_out * x) / full_scalar(SCALE, x.dtype()); } template Tensor leaky_relu_decomp(const Tensor& x, float negative_slope) { - auto multiply_tmp = full(empty_shape, negative_slope, x.dtype()) * x; + auto multiply_tmp = full_scalar(negative_slope, x.dtype()) * x; if (negative_slope < 1.0) { return maximum(x, multiply_tmp); } else { @@ -1127,8 +1122,7 @@ std::tuple group_norm_decomp( var_ = maximum( var_tmp_, backend::full_with_tensor(shape(var_tmp_), 0, var_tmp_.dtype())); - Tensor var_inv = - rsqrt(var_ + full(empty_shape, epsilon, var_.dtype())); + Tensor var_inv = rsqrt(var_ + full_scalar(epsilon, var_.dtype())); Tensor res = (x_cast - mean_) * var_inv; out = backend::reshape(res, x_dim_t); } else { @@ -1143,7 +1137,7 @@ std::tuple group_norm_decomp( auto var_tmp_ = mean_decomp(x_cast * x_cast, c_axis, true) - mean_ * mean_; var_ = maximum(var_tmp_, full(var_tmp_.shape(), 0, var_tmp_.dtype())); - auto var_inv = rsqrt(var_ + full(empty_shape, epsilon, var_.dtype())); + auto var_inv = rsqrt(var_ + full_scalar(epsilon, var_.dtype())); auto res = (x_cast - mean_) * 
var_inv; out = reshape(res, x_dim); } @@ -1207,7 +1201,7 @@ Tensor square_decomp(const Tensor& x) { } Tensor two; - two = full(empty_shape, 2, x_cast.dtype()); + two = full_scalar(2, x_cast.dtype()); auto ans = elementwise_pow(x_cast, two); if (need_cast) { @@ -1224,9 +1218,8 @@ Tensor sigmoid_cross_entropy_with_logits_decomp( const paddle::optional& pos_weight, bool normalize, int ignore_index) { - auto dims = x.shape(); - const Tensor zero = full(dims, 0, x.type()); - const Tensor one = full(dims, 1, x.type()); + const Tensor zero = full_like_decomp(x, 0, x.type(), x.place()); + const Tensor one = full_like_decomp(x, 1, x.type(), x.place()); Tensor pos_weight_tensor; if (pos_weight) { pos_weight_tensor = pos_weight.get(); @@ -1235,19 +1228,20 @@ Tensor sigmoid_cross_entropy_with_logits_decomp( } auto term1 = where(x > zero, x, zero); auto term2 = x * label; - auto term3 = log(1 + exp(-abs(x))); + auto term3 = log(one + exp(-abs(x))); const Tensor tmp_out = term1 - term2 + term3 * pos_weight_tensor; - const Tensor ignore_index_tensor = full(dims, ignore_index, label.type()); + const Tensor ignore_index_tensor = + full_like_decomp(x, ignore_index, label.type(), label.place()); auto out = where(label == ignore_index_tensor, zero, tmp_out); if (normalize) { // Follow the implementation in // paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc - const Tensor eps1 = full(dims, 1e-6, x.type()); + const Tensor eps1 = full_like_decomp(x, 1e-6, x.type(), x.place()); auto diff = label - ignore_index_tensor; const Tensor tmp_norm = sum(where(abs(diff) > eps1, one, zero)); // Follow the implementation in // paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc - const Tensor eps2 = full(empty_shape, 1e-5, x.type()); + const Tensor eps2 = full_scalar(1e-5, x.type()); auto norm = where(tmp_norm > eps2, tmp_norm, eps2); out = out / norm; } @@ -1387,8 +1381,8 @@ Tensor elu_decomp(const Tensor& x, const float alpha) { if (has_dynamic_shape(x_cast.shape())) { zero = backend::full_with_tensor(shape(x_cast), 0, x_cast.dtype()); - tmp_res = full(empty_shape, alpha, x_cast.dtype()) * - (exp(x_cast) - full(empty_shape, 1, x_cast.dtype())); + tmp_res = full_scalar(alpha, x_cast.dtype()) * + (exp(x_cast) - full_scalar(1, x_cast.dtype())); } else { zero = full(x_cast.shape(), 0, x_cast.type()); tmp_res = alpha * (exp(x_cast) - 1); diff --git a/paddle/fluid/primitive/manual/manual_primitive.h b/paddle/fluid/primitive/manual/manual_primitive.h index f2ec3ebce45b3..6587adf862a6e 100644 --- a/paddle/fluid/primitive/manual/manual_primitive.h +++ b/paddle/fluid/primitive/manual/manual_primitive.h @@ -30,6 +30,15 @@ Tensor full(const IntArray& shape, return backend::full(shape, value, dtype, place); } +template +Tensor full_scalar(const Scalar& value, + DataType dtype = DataType::FLOAT32, + Place place = Place()) { + // empty_shape means x.shape=[] + std::vector empty_shape; + return backend::full(empty_shape, value, dtype, place); +} + template Tensor assign_out_(const Tensor& x, const Tensor& output) { return backend::assign_out_(x, output); diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 00e464859e29e..42d3ce7d97dfd 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -1483,13 +1483,20 @@ void slice_grad(const Tensor& input, paddings.push_back(offsets[i]); paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]); } + Tensor reshape_out_grad; + if (out_grad.shape().size() == 0) { 
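full_scalar, added above in manual_primitive.h, is what the composite.h hunks swap in for the old full(empty_shape, ...) calls: a scalar constant becomes a 0-D tensor (shape []) that broadcasts against an operand of any rank, so each decomposition no longer carries its own empty_shape vector. A sketch of the intended usage in terms of the diff's own primitives (comments only, since Tensor is Paddle-internal and not standalone-compilable):

// Tensor one  = full_scalar<T>(1.0, x.dtype());  // 0-D constant, shape []
// Tensor half = full_scalar<T>(0.5, x.dtype());
// auto y = x * half + one;  // 0-D operands broadcast over x's shape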
+ reshape_out_grad = full({1}, 1, input.dtype()); + } else { + reshape_out_grad = out_grad; + } + if (decrease_size > 0 && (decrease_size != static_cast(in_dims.size()))) { auto out_tmp = - pad(reshape(out_grad, origin_out_shape), paddings, 0.0); + pad(reshape(reshape_out_grad, origin_out_shape), paddings, 0.0); set_output(out_tmp, input_grad); } else { - auto out_tmp = pad(out_grad, paddings, 0.0); + auto out_tmp = pad(reshape_out_grad, paddings, 0.0); set_output(out_tmp, input_grad); } } @@ -1548,7 +1555,8 @@ void leaky_relu_grad(const Tensor& out, template void sigmoid_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - set_output(out_grad * (out * (1 - out)), x_grad); + auto one_tensor = full_scalar(1.0, out.dtype()); + set_output(out_grad * (out * (one_tensor - out)), x_grad); } } @@ -1772,11 +1780,13 @@ void prod_grad(const Tensor& x, } else { reduce_all = false; } - auto x_grad_tmp = Tensor(); - auto out_tmp = Tensor(); + auto out_grad_tmp = Tensor(); + auto x_reshape = Tensor(); + std::vector unchange_axis, change_axis, transpose_shape, + cumprod_shape; + std::vector transpose_dim, origin_position; if (x_dim_size == 1) { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } else { if (!keep_dim) { auto axis_ = std::vector(); @@ -1794,16 +1804,69 @@ void prod_grad(const Tensor& x, } auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_); auto out_grad_ = reshape(out_grad, out_grad_shape); - x_grad_tmp = out_grad_.expand(IntArray(x_dim)); - auto out_ = reshape(out, out_grad_shape); - out_tmp = out_.expand(IntArray(x_dim)); + out_grad_tmp = out_grad_.expand(IntArray(x_dim)); } else { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } } - auto x_grad_res = x_grad_tmp * out_tmp * (1 / x); - set_output(x_grad_res, x_grad); + auto axis_ = std::vector(); + if (reduce_all) { + int64_t numel = 1; + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + numel *= x_dim[i]; + } + cumprod_shape.push_back(numel); + x_reshape = reshape(x, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_tmp2 = reshape(x_grad_tmp, x.shape()); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } else { + int64_t unchange_size = x_dim_size - axis_size; + int64_t unchange_index = 0; + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_.push_back(axis[i] + x_dim_size); + } else { + axis_.push_back(axis[i]); + } + } + for (int64_t i = 0; i < x_dim_size; i++) { + auto it = find(axis_.begin(), axis_.end(), i); + if (it != axis_.end()) { + int64_t index = it - axis_.begin(); + origin_position.push_back(static_cast(unchange_size + index)); + } else { + unchange_axis.push_back(i); + origin_position.push_back(static_cast(unchange_index)); + unchange_index += 1; + } + } + int64_t numel = 1; + for (int64_t i = 0; i < unchange_size; i++) { + transpose_shape.push_back(x_dim[unchange_axis[i]]); + cumprod_shape.push_back(x_dim[unchange_axis[i]]); + transpose_dim.push_back(static_cast(unchange_axis[i])); + } + for (int64_t i = 0; i < axis_size; i++) { + transpose_shape.push_back(x_dim[axis_[i]]); + transpose_dim.push_back(static_cast(axis_[i])); + numel *= x_dim[axis_[i]]; + } + cumprod_shape.push_back(numel); + 
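Both slice_grad bodies in this diff implement the same rule: the gradient of a slice is out_grad scattered back into zeros of the input's shape, which is exactly what the pad call with the offset-derived paddings expresses; the new reshape_out_grad branch only substitutes a rank-1, one-element tensor when out_grad is 0-D, so the reshape/pad calls receive a non-empty shape. A standalone 1-D sketch of the scatter (illustrative values):

#include <cstdio>
#include <vector>

int main() {
  // Gradient of y = input[2:4] for a length-6 input: place out_grad at the
  // slice offset and zero-fill the rest -- pad(out_grad, paddings, 0.0).
  std::vector<double> out_grad = {10.0, 20.0};
  const int in_len = 6, offset = 2;
  std::vector<double> input_grad(in_len, 0.0);
  for (size_t i = 0; i < out_grad.size(); ++i)
    input_grad[offset + i] = out_grad[i];
  for (double g : input_grad) printf("%g ", g);  // 0 0 10 20 0 0
  printf("\n");
}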
auto x_transpose = transpose(x, transpose_dim); + x_reshape = reshape(x_transpose, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_reshape = reshape(x_grad_tmp, transpose_shape); + auto x_grad_tmp2 = transpose(x_grad_reshape, origin_position); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } } } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6deffc89271f9..aec35c6f6896a 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -14,7 +14,6 @@ set(PYBIND_DEPS pass generate_pass pass_builder - parallel_executor compiled_program profiler layer @@ -130,7 +129,6 @@ set(PYBIND_SRCS protobuf.cc exception.cc op_function_common.cc - parallel_executor.cc compiled_program.cc tensor.cc place.cc diff --git a/paddle/fluid/pybind/graph.cc b/paddle/fluid/pybind/graph.cc index 6acba237ba928..4e5329bbf2bfc 100644 --- a/paddle/fluid/pybind/graph.cc +++ b/paddle/fluid/pybind/graph.cc @@ -47,8 +47,7 @@ using paddle::framework::ir::NodeComp; using paddle::framework::ir::TopologySortOperations; using pybind11::return_value_policy; -namespace paddle { -namespace pybind { +namespace paddle::pybind { void BindGraph(py::module *m) { m->def("graph_safe_remove_nodes", [](Graph *graph, const std::unordered_set &nodes) { @@ -408,5 +407,4 @@ void BindPass(py::module *m) { }); } -} // namespace pybind -} // namespace paddle +} // namespace paddle::pybind diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index 872be599d9a76..f41950db85e6d 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -81,7 +81,7 @@ static PyObject *static_api_set_parameter(PyObject *self, } } -static PyObject *static_api_updata_parameter(PyObject *self, +static PyObject *static_api_update_parameter(PyObject *self, PyObject *args, PyObject *kwargs) { try { @@ -98,7 +98,7 @@ static PyObject *static_api_updata_parameter(PyObject *self, // Call ir static api CallStackRecorder callstack_recoder("uodata_parameter"); callstack_recoder.Record(); - paddle::dialect::updata_parameter(parameter, name); + paddle::dialect::update_parameter(parameter, name); callstack_recoder.AttachToOps(); Py_RETURN_NONE; } catch (...) { @@ -975,10 +975,10 @@ static PyMethodDef ManualOpsAPI[] = { (PyCFunction)(void (*)(void))static_api_set_parameter, METH_VARARGS | METH_KEYWORDS, "C++ interface function for set_parameter."}, - {"updata_parameter", - (PyCFunction)(void (*)(void))static_api_updata_parameter, + {"update_parameter", + (PyCFunction)(void (*)(void))static_api_update_parameter, METH_VARARGS | METH_KEYWORDS, - "C++ interface function for updata_parameter."}, + "C++ interface function for update_parameter."}, {"set_persistable_value", (PyCFunction)(void (*)(void))static_api_set_persistable_value, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc deleted file mode 100644 index 7f6b054564bc6..0000000000000 --- a/paddle/fluid/pybind/parallel_executor.cc +++ /dev/null @@ -1,1178 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -// Avoid a problem with copysign defined in pyconfig.h on Windows. -#ifdef copysign -#undef copysign -#endif - -#include -#include -#include -#include -#include -#include -#include // NOLINT // for call_once -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/custom_operator.h" -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/executor_cache.h" -#include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" -#include "paddle/fluid/framework/ir/cost_model.h" -#include "paddle/fluid/framework/ir/generate_pass.h" -#include "paddle/fluid/framework/ir/pass_builder.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/new_executor/executor_statistics.h" -#include "paddle/fluid/framework/new_executor/standalone_executor.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/parallel_executor.h" -#include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/scope_pool.h" -#include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/trainer.h" -#include "paddle/fluid/framework/type_defs.h" -#include "paddle/fluid/framework/version.h" -#include "paddle/fluid/imperative/amp_auto_cast.h" -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/memory/allocation/allocator_strategy.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" -#endif -#include "paddle/fluid/memory/allocation/mmap_allocator.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/common_infer_shape_functions.h" -#include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_python.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/fluid/platform/profiler/profiler.h" -#include 
"paddle/fluid/pybind/bind_cost_model.h" -#include "paddle/fluid/pybind/bind_fleet_executor.h" -#include "paddle/fluid/pybind/box_helper_py.h" -#include "paddle/fluid/pybind/communication.h" -#include "paddle/fluid/pybind/compatible.h" -#include "paddle/fluid/pybind/const_value.h" -#include "paddle/fluid/pybind/cuda_streams_py.h" -#include "paddle/fluid/pybind/data_set_py.h" -#include "paddle/fluid/pybind/distributed_py.h" -#include "paddle/fluid/pybind/eager.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/pybind/fleet_wrapper_py.h" -#include "paddle/fluid/pybind/generator_py.h" -#include "paddle/fluid/pybind/global_value_getter_setter.h" -#include "paddle/fluid/pybind/gloo_context_py.h" -#include "paddle/fluid/pybind/gloo_wrapper_py.h" -#include "paddle/fluid/pybind/graph.h" -#include "paddle/fluid/pybind/heter_wrapper_py.h" -#include "paddle/fluid/pybind/imperative.h" -#include "paddle/fluid/pybind/inference_api.h" -#include "paddle/fluid/pybind/io.h" -#include "paddle/fluid/pybind/metrics_py.h" -#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" -#include "paddle/fluid/pybind/pybind_variant_caster.h" -#include "paddle/phi/backends/cpu/cpu_info.h" -#include "paddle/phi/backends/device_manager.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/lod_utils.h" -#include "paddle/utils/none.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/pybind/nccl_wrapper_py.h" -#endif -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/pybind/protobuf.h" -#include "paddle/fluid/pybind/pybind.h" // NOLINT -#include "paddle/fluid/pybind/reader_py.h" -#include "paddle/fluid/pybind/tensor_py.h" -#include "paddle/utils/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#endif -#ifndef PADDLE_WITH_HIP -#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#endif - -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_info.h" -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" -#endif - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/phi/capi/capi.h" -#endif - -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" - -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include "paddle/fluid/platform/device/ipu/ipu_info.h" -#endif - -#ifdef PADDLE_WITH_CRYPTO -#include "paddle/fluid/pybind/crypto.h" -#endif - -#if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/pybind/fleet_py.h" -#endif - -#ifdef PADDLE_WITH_CINN -#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" -#endif - -#include "paddle/common/flags.h" -#include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/imperative/layout_autotune.h" -#include "paddle/fluid/pybind/eager_utils.h" -#include "paddle/fluid/pybind/parallel_executor.h" -#include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/kernels/autotune/cache.h" -#include "paddle/phi/kernels/autotune/switch_autotune.h" -#include "pybind11/stl.h" - -COMMON_DECLARE_bool(use_mkldnn); - -// disable auto conversion to list in Python -PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); -PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); -PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); -PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); - -namespace 
paddle { -namespace pybind { -using namespace paddle::framework; // NOLINT -void BindParallelExecutor(pybind11::module &m) { // NOLINT - // -- python binds for parallel executor. - py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy"); - - py::enum_(m, "DeviceType", py::arithmetic()) - .value("CPU", paddle::platform::DeviceType::CPU) - .value("CUDA", paddle::platform::DeviceType::CUDA) - .value("XPU", paddle::platform::DeviceType::XPU); - - exec_strategy.def(py::init()) - .def_property( - "num_threads", - [](const ExecutionStrategy &self) { return self.num_threads_; }, - [](ExecutionStrategy &self, size_t num_threads) { - self.num_threads_ = num_threads; - }) - .def_property( - "_use_device", - [](const ExecutionStrategy &self) { return self.use_device_; }, - [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { - self.use_device_ = use_device; - }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because - // use_device isn‘t exposed to users. - .def_property( - "allow_op_delay", - [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, - [](ExecutionStrategy &self, bool allow_op_delay) { - self.allow_op_delay_ = allow_op_delay; - }) - .def_property( - "num_iteration_per_drop_scope", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_drop_scope_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { - self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }) - .def_property( - "num_iteration_per_run", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_run_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_run) { - self.num_iteration_per_run_ = num_iteration_per_run; - }) - .def_property( - "use_thread_barrier", - [](const ExecutionStrategy &self) { return self.thread_barrier_; }, - [](ExecutionStrategy &self, bool use_thread_barrier) { - self.thread_barrier_ = use_thread_barrier; - }) - .def_property( - "_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); - - py::class_ build_strategy(pe, "BuildStrategy", R"DOC( - BuildStrategy allows the user to more preciously control how to - build the SSA Graph in ParallelExecutor by setting the property. - - Returns: - BuildStrategy: An BuildStrategy object. - - Examples: - .. 
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> data = static.data(name="x", shape=[None, 1], dtype="float32") - >>> hidden = static.nn.fc(data, size=10) - >>> loss = paddle.mean(hidden) - >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.enable_inplace = True - >>> build_strategy.memory_optimize = True - >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - >>> program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy) -)DOC"); - - py::enum_(build_strategy, "ReduceStrategy") - .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) - .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); - py::enum_(build_strategy, - "GradientScaleStrategy") - .value("CoeffNumDevice", - BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) - .value("One", BuildStrategy::GradientScaleStrategy::kOne) - .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); - - build_strategy.def(py::init()) - .def("_clear_finalized", &BuildStrategy::ClearFinalized) - .def_property( - "reduce_strategy", - [](const BuildStrategy &self) { return self.reduce_; }, - [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.reduce_ = strategy; - }, - R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce - strategies in ParallelExecutor, AllReduce and Reduce. If you want - that all the parameters' optimization are done on all devices independently, - you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' - optimization will be evenly distributed to different devices, and then - broadcast the optimized parameter to other devices. - Default is 'AllReduce'. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - )DOC") - .def_property( - "gradient_scale_strategy", - [](const BuildStrategy &self) { return self.gradient_scale_; }, - [](BuildStrategy &self, - BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.gradient_scale_ = strategy; - }, - R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three - ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, - One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` - according to the number of devices. If you want to customize :math:`loss@grad`, - you can choose Customized. Default is 'CoeffNumDevice'. - - Examples: - .. 
code-block:: python - - >>> import numpy - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> use_cuda = paddle.device.is_compiled_with_cuda - >>> place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - >>> exe = static.Executor(place) - - >>> data = static.data(name='X', shape=[None, 1], dtype='float32') - >>> hidden = static.nn.fc(data, size=10) - >>> loss = paddle.mean(hidden) - >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - >>> exe.run(static.default_startup_program()) - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.gradient_scale_strategy = \ - ... static.BuildStrategy.GradientScaleStrategy.Customized - >>> compiled_prog = static.CompiledProgram( - ... static.default_main_program(), - ... build_strategy=build_strategy, - >>> ) - - >>> x = numpy.random.random(size=(10, 1)).astype('float32') - >>> loss_grad = numpy.ones((1)).astype("float32") * 0.01 - >>> loss_grad_name = loss.name+"@GRAD" - >>> loss_data = exe.run(compiled_prog, - ... feed={"X": x, loss_grad_name : loss_grad}, - ... fetch_list=[loss.name, loss_grad_name]) - )DOC") - .def_property( - "debug_graphviz_path", - [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, - [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.debug_graphviz_path_ = path; - }, - R"DOC((str, optional): debug_graphviz_path indicates the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default is empty string, that is, "" - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.debug_graphviz_path = "./graph" - )DOC") - .def_property( - "enable_sequential_execution", - [](const BuildStrategy &self) { - return self.enable_sequential_execution_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.enable_sequential_execution_ = b; - }, - R"DOC((bool, optional): If set True, the execution order of ops would - be the same as what is in the program. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.enable_sequential_execution = True - )DOC") - .def_property( - "remove_unnecessary_lock", - [](const BuildStrategy &self) { - return self.remove_unnecessary_lock_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.remove_unnecessary_lock_ = b; - }, - R"DOC((bool, optional): If set True, some locks in GPU ops would be - released and ParallelExecutor would run faster. Default is True. - - Examples: - .. 
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.remove_unnecessary_lock = True - )DOC") - .def_property( - "num_trainers", - [](const BuildStrategy &self) { return self.num_trainers_; }, - [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); -#endif - self.num_trainers_ = num_trainers; - }) - .def_property( - "trainers_endpoints", - [](const BuildStrategy &self) { return self.trainers_endpoints_; }, - [](BuildStrategy &self, - const std::vector &trainers_endpoints) { - self.trainers_endpoints_ = trainers_endpoints; - }) - .def_property( - "trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property( - "bkcl_comm_num", - [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, - [](BuildStrategy &self, int bkcl_comm_num) { - self.bkcl_comm_num_ = bkcl_comm_num; - }) - .def_property( - "use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property( - "hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - .def_property( - "build_cinn_pass", - [](const BuildStrategy &self) { return self.build_cinn_pass_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.build_cinn_pass_ = b; - }, - R"DOC((bool, optional): build_cinn_pass indicates whether - to lowering some operators in graph into cinn ops - to execute, which will speed up the process of execution. - Default False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.build_cinn_pass = True - )DOC") - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. 
.def_property( - "build_cinn_pass", - [](const BuildStrategy &self) { return self.build_cinn_pass_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.build_cinn_pass_ = b; - }, - R"DOC((bool, optional): build_cinn_pass indicates whether - to lower some operators in the graph into CINN ops - for execution, which will speed up execution. - Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.build_cinn_pass = True - )DOC") - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicates whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_elewise_add_act_ops = True - )DOC") - .def_property( - "fuse_gemm_epilogue", - [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_gemm_epilogue_ = b; - }, - R"DOC((bool, optional): fuse_gemm_epilogue indicates whether - to fuse matmul_op, elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_gemm_epilogue = True - )DOC") - .def_property( - "fuse_dot_product_attention", - [](const BuildStrategy &self) { - return self.fuse_dot_product_attention_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_dot_product_attention_ = b; - }, - R"DOC((bool, optional): fuse_dot_product_attention indicates whether - to fuse dot product attention, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_dot_product_attention = True - )DOC") - .def_property( - "fuse_adamw", - [](const BuildStrategy &self) { return self.fuse_adamw_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_adamw_ = b; - }, - R"DOC((bool, optional): fuse_adamw indicates whether - to fuse all adamw optimizers with multi_tensor_adam, - it may make the execution faster. Default is False. - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_adamw = True - )DOC") - .def_property( - "fused_attention", - [](const BuildStrategy &self) { return self.fused_attention_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fused_attention_ = b; - }, - R"DOC((bool, optional): fused_attention indicates whether - to fuse the whole multi head attention part with one op, - it may make the execution faster. Default is False. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fused_attention = True - )DOC") - .def_property( - "fused_feedforward", - [](const BuildStrategy &self) { return self.fused_feedforward_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fused_feedforward_ = b; - }, - R"DOC((bool, optional): fused_feedforward indicates whether - to fuse the whole feed_forward part with one op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fused_feedforward = True - )DOC") - .def_property( - "sequential_run", - [](const BuildStrategy &self) { return self.sequential_run_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.sequential_run_ = b; - }, - R"DOC((bool, optional): sequential_run is used to let the `StandaloneExecutor` run ops in the - order of `ProgramDesc`. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.sequential_run = True - )DOC") - .def_property( - "fuse_resunit", - [](const BuildStrategy &self) { return self.fuse_resunit_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_resunit_ = b; -#ifndef PADDLE_WITH_CUDNN_FRONTEND - if (self.fuse_resunit_) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle is not built with CUDNN Frontend support.")); - } -#endif - }, - R"DOC((bool, optional): fuse_resunit indicates whether - to fuse ResUnit patterns; this requires Paddle to be built - with CUDNN Frontend support. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_resunit = True - )DOC") - .def_property( - "fuse_bn_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_bn_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_act_ops indicates whether - to fuse batch_norm and activation_op, - it may make the execution faster. Default is False. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_bn_act_ops = True - )DOC") - .def_property( - "fuse_bn_add_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_bn_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_add_act_ops indicates whether - to fuse batch_norm, elementwise_add and activation_op, - it may make the execution faster. Default is True. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_bn_add_act_ops = True - )DOC") - .def_property( - "enable_auto_fusion", - [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.enable_auto_fusion_ = b; - }, - R"DOC((bool, optional): Whether to enable fusing a subgraph into a - fusion_group. Currently, only subgraphs composed of elementwise-like - operators, such as elementwise_add/mul without broadcast, and - activations are supported. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.enable_auto_fusion = True - )DOC") - .def_property( - "fuse_relu_depthwise_conv", - [](const BuildStrategy &self) { - return self.fuse_relu_depthwise_conv_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_relu_depthwise_conv_ = b; - }, - R"DOC((bool, optional): fuse_relu_depthwise_conv indicates whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This option is only available on GPU devices. - Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_relu_depthwise_conv = True - )DOC") - .def_property( - "fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_ops indicates whether - to fuse the broadcast ops. Note that, in Reduce mode, - fusing broadcast ops may make the program faster, because - fusing broadcast ops is equivalent to delaying their execution, - so that for a period of time all NCCL streams are used only - for NCCLReduce operations. Default is False. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_broadcast_ops = True - )DOC") - .def_property( - "fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) - .def_property( - "sync_batch_norm", - [](const BuildStrategy &self) { return self.sync_batch_norm_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.sync_batch_norm_ = b; - }, - R"DOC((bool, optional): sync_batch_norm indicates whether to use - synchronous batch normalization, which synchronizes the mean - and variance across multiple devices during training. - The current implementation does not support FP16 training or CPU, - and synchronization happens only within a single machine, - not across machines. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.sync_batch_norm = True - )DOC") - .def_property( - "memory_optimize", - [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { // NOLINT - return py::cast(self.memory_optimize_.get()); - } else { - return py::cast(nullptr); - } - }, - [](BuildStrategy &self, const py::handle &value) { - auto *py_obj = value.ptr(); - if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = paddle::none; - } else if (PyBool_Check(py_obj)) { - self.memory_optimize_ = (py_obj == Py_True); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "BuildStrategy.memory_optimize must be set to None, False " - "or True")); - } - }, - R"DOC((bool, optional): memory_optimize aims to reduce total memory - consumption, set to True to enable it. - - Default is None, which means the framework chooses automatically: - currently, it is enabled when GC is disabled, and disabled when GC - is enabled. True means enabling and False means disabling. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.memory_optimize = True - - )DOC") - .def_property( - "is_distribution", - [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); - } -#else - self.is_distribution_ = b; -#endif - }) - .def_property( - "async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) - .def_property( - "enable_inplace", - [](const BuildStrategy &self) { return self.enable_inplace_; }, - [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property( - "enable_addto", - [](const BuildStrategy &self) { return self.enable_addto_; }, - [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) - .def_property( - "fuse_all_reduce_ops", - [](const BuildStrategy &self) { - return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property( - "enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) - .def_property( - "cache_runtime_context", - [](const BuildStrategy &self) { return self.cache_runtime_context_; }, - [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) - .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) - .def_property( - "fix_op_run_order", - [](const BuildStrategy &self) { return self.fix_op_run_order_; }, - [](BuildStrategy &self, bool fix_op_run_order) { - self.fix_op_run_order_ = fix_op_run_order; - }) - .def_property( - "allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) - .def("_copy", - [](const BuildStrategy &self) { - auto new_bs = self; - new_bs.ClearFinalized(); - return new_bs; - }) - .def("__str__", - [](const BuildStrategy &self) { - std::stringstream ss; - ss << self; - return ss.str(); - }) - .def( - "_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow users to customize passes. Normally, model-specific - optimization passes should be defined in this way. BuildStrategy - cannot be updated after being finalized.)DOC"); - - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - - pe.def(py::init &, - const std::vector &, - const std::string &, - Scope *, - std::vector &, - const ExecutionStrategy &, - const BuildStrategy &, - ir::Graph *>()) - // NOTE: even we return a vec* to Python use reference policy.
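Taken together, the BuildStrategy bindings deleted above are typically driven from Python as in the sketch below. The particular flag choices are illustrative only, drawn from the docstrings in this file:

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    build_strategy = static.BuildStrategy()
    # Each option mirrors a def_property binding documented above.
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.fuse_bn_act_ops = True
    build_strategy.memory_optimize = True

    compiled_prog = static.CompiledProgram(
        static.default_main_program(), build_strategy=build_strategy
    )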
- // We still cannot get local_scope from this vector, since the element - // of vec will be freed by Python GC. We can only return Scope* - // one by one and mark them as reference. - .def( - "local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) - .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) - .def("_need_create_local_exe_scopes", - &ParallelExecutor::NeedCreateLocalExeScope) - .def("feed_tensors_into_local_scopes", - &ParallelExecutor::FeedTensorsIntoLocalScopes) - .def("feed_and_split_tensor_into_local_scopes", - &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", - [](ParallelExecutor &self, - const std::vector &fetch_tensors, - bool return_merged) -> py::object { - if (return_merged) { - paddle::framework::FetchList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.RunAndMerge(fetch_tensors); - } - return py::cast(std::move(ret)); - } else { - paddle::framework::FetchUnmergedList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.Run(fetch_tensors); - } - return py::cast(std::move(ret)); - } - }) - .def("device_count", &ParallelExecutor::DeviceCount); - using VarQuantScale = - std::unordered_map>; - py::class_> pass(m, "Pass"); - pass.def(py::init()) - .def("has", &ir::Pass::Has) - .def("set_not_owned", - [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { - self.SetNotOwned(attr_name, &attr); - }) - .def( - "set", - [](ir::Pass &self, const std::string &name, const std::string &attr) { - self.Set(name, new std::string(attr)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, bool val) { - self.Set(name, new bool(val)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::vector set) { - self.Set(name, new std::vector(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, VarQuantScale scales) { - self.Set(name, new VarQuantScale(scales)); - }) - .def("type", &ir::Pass::Type) - .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - self.Apply(graph.get()); - }); - - py::class_> pb( - m, "PassBuilder"); - pb.def(py::init()) - .def("append_pass", - [](ir::PassBuilder &self, - const std::string &pass_type) -> std::shared_ptr { - return self.AppendPass(pass_type); - }) - .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) - .def("insert_pass", - [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { - return self.InsertPass(idx, pass_type); - }) - .def("remove_pass", - [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); -} - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 999475d5944d5..fcd8c12579847 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -116,6 +116,7 @@ using pir::Program; using pir::StrAttribute; using pir::Type; using pir::Value; +using pir::VectorType; using pybind11::return_value_policy; COMMON_DECLARE_bool(print_ir); @@ -411,6 +412,12 @@ void BindProgram(py::module *m) { 
[](Program &self, IrMapping &ir_mapper) { return Clone(self, &ir_mapper); }) + .def( + "copy_to_block", + [](std::shared_ptr self, + pir::IrMapping &mapper, + Block *block) { return self->CopyToBlock(mapper, block); }, + return_value_policy::reference) .def( "list_vars", [](std::shared_ptr self) { @@ -653,9 +660,12 @@ void BindIrMapping(py::module *m) { ir_mapping.def(py::init<>()) .def("look_up", [](IrMapping &self, Value from) { return self.Lookup(from); }) - .def("add", [](IrMapping &self, Value from, Value to) { - self.Add(from, to); - }); + .def("add", + [](IrMapping &self, Value from, Value to) { + self.Add(from, to); + }) + .def("size", + [](IrMapping &self) { return self.GetMutableMap().size(); }); } void BindCloneOptions(py::module *m) { @@ -1320,6 +1330,13 @@ void BindType(py::module *m) { PADDLE_THROW(phi::errors::InvalidArgument( "can't set _local_shape when building static graph")); }) + .def("as_vec_type", + [](Type self) -> py::object { + if (auto vec_type = self.dyn_cast()) { + return py::cast(vec_type); + } + return py::cast(Py_None); + }) .def("__str__", [](Type &self) { std::ostringstream print_stream; print_stream << self; @@ -1354,7 +1371,13 @@ void BindType(py::module *m) { } }); } - +void BindVectorType(py::module *m) { + py::class_ vec_type(*m, "VectorType"); + vec_type.def("as_list", &VectorType::data); + m->def("create_vec_type", [](std::vector &types) { + return VectorType::get(pir::IrContext::Instance(), types); + }); +} void BindAttribute(py::module *m) { py::class_ ir_attr(*m, "Attribute", py::module_local()); ir_attr.def("__eq__", &Attribute::operator==) @@ -2486,6 +2509,7 @@ void BindPir(pybind11::module *module) { BindOperation(&ir_module); BindOpOperand(&ir_module); BindType(&ir_module); + BindVectorType(&ir_module); BindAttribute(&ir_module); BindInsertionPoint(&ir_module); BindUtils(&ir_module); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b1163adc932fc..ae49f2594ce0a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -62,7 +62,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/raw_tensor.h" @@ -146,7 +145,6 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/pybind/compiled_program.h" -#include "paddle/fluid/pybind/parallel_executor.h" #include "paddle/fluid/pybind/place.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT diff --git a/paddle/phi/api/generator/dist_api_gen.py b/paddle/phi/api/generator/dist_api_gen.py index 54605d19b256d..a8a2b23d3d026 100644 --- a/paddle/phi/api/generator/dist_api_gen.py +++ b/paddle/phi/api/generator/dist_api_gen.py @@ -295,7 +295,7 @@ }} std::vector {name}_meta_ptr_vec({name}.size()); for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ - {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; + {name}_meta_ptr_vec[i] = {name}[i] ? 
&{name}_meta_vec[i] : nullptr; }} """ INFER_GLOBAL_SHAPE_TEMPLATE = """ @@ -400,7 +400,7 @@ std::vector {name}_meta_vec = MakeMetaTensor({name}); std::vector {name}_meta_ptr_vec({name}_meta_vec.size()); for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ - {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; + {name}_meta_ptr_vec[i] = {name}[i] ? &{name}_meta_vec[i] : nullptr; }} """ INFER_META_TEMPLATE = """ diff --git a/paddle/phi/api/generator/dist_bw_api_gen.py b/paddle/phi/api/generator/dist_bw_api_gen.py index 1d57d552d7767..34d495d9d0536 100644 --- a/paddle/phi/api/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/generator/dist_bw_api_gen.py @@ -53,33 +53,41 @@ std::shared_ptr shared_dist_out = CreateKernelDistOutput({}, !rank_is_in_current_mesh, spmd_info.second[0]); phi::distributed::DistTensor* dist_out = shared_dist_out.get(); - phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value(); - if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ - *dense_out = phi::DenseTensor( + phi::DenseTensor* dense_out = nullptr; + if (dist_out) {{ + dense_out = dist_out->unsafe_mutable_value(); + if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ + *dense_out = phi::DenseTensor( std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), phi::DenseTensorMeta()); }} + }} """ SINGLE_OUT_CREATION_TEMPLATE = """ std::shared_ptr shared_dist_out = CreateKernelDistOutput({}, !rank_is_in_current_mesh); phi::distributed::DistTensor* dist_out = shared_dist_out.get(); - phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value(); - if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ + phi::DenseTensor* dense_out = nullptr; + if (dist_out) {{ + dense_out = dist_out->unsafe_mutable_value(); + if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ *dense_out = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} """ VECTOR_OUT_CREATION_TEMPLATE_WITH_NO_SPMD = """ auto dist_out = SetKernelDistOutput({name}); - std::vector dense_out(dist_out.size()); + std::vector dense_out(dist_out.size(), nullptr); for (size_t i=0; iunsafe_mutable_value(); - if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ - *dense_out[i] = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out[i]) {{ + dense_out[i] = dist_out[i]->unsafe_mutable_value(); + if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ @@ -90,13 +98,15 @@ for(auto& e: shared_dist_out){{ dist_out.push_back(e.get()); }} - std::vector dense_out(dist_out.size()); + std::vector dense_out(dist_out.size(), nullptr); for (size_t i=0; iunsafe_mutable_value(); - if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ - *dense_out[i] = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out[i]) {{ + dense_out[i] = dist_out[i]->unsafe_mutable_value(); + if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ @@ -108,13 
+118,15 @@ for(auto& e: shared_dist_out){{ dist_out.push_back(e.get()); }} - std::vector dense_out(dist_out.size()); + std::vector dense_out(dist_out.size(), nullptr); for (size_t i=0; iunsafe_mutable_value(); - if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ - *dense_out[i] = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out[i]) {{ + dense_out[i] = dist_out[i]->unsafe_mutable_value(); + if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ @@ -156,13 +168,15 @@ """ MULTI_VECTOR_OUT_CREATION_TEMPLATE = """ auto dist_out_{i} = SetKernelDistOutput({name}); - std::vector dense_out_{i}(dist_out_{i}.size()); + std::vector dense_out_{i}(dist_out_{i}.size(), nullptr); for (size_t i = 0; i < dist_out_{i}.size(); i++) {{ - dense_out_{i}[i] = const_cast(&dist_out_{i}[i]->value()); - if (dense_out_{i}[i] && !rank_is_in_current_mesh && !dist_out_{i}[i]->defined()) {{ - *dense_out_{i}[i]= phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out_{i}[i]) {{ + dense_out_{i}[i] = const_cast(&dist_out_{i}[i]->value()); + if (dense_out_{i}[i] && !rank_is_in_current_mesh && !dist_out_{i}[i]->defined()) {{ + *dense_out_{i}[i]= phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index ef5cfc90727ff..c6426898371d2 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -736,6 +736,7 @@ std::shared_ptr CreateKernelDistOutput( } return dist_output; } + VLOG(4) << "CreateKernelDistOutput with NULL out"; return nullptr; } diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index ee1e21a58e2f1..e2eb1af09d8a5 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -23,8 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/cuda_stream.h" #endif -namespace paddle { -namespace experimental { +namespace paddle::experimental { void DeviceContextPool::SyncDeviceContext(const Place& place) { if (!phi::DeviceContextPool::IsInitialized()) { @@ -64,8 +63,7 @@ phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) { return const_cast(Get(place)); // NOLINT } -} // namespace experimental -} // namespace paddle +} // namespace paddle::experimental namespace paddle { diff --git a/paddle/phi/backends/dynload/cublas.cc b/paddle/phi/backends/dynload/cublas.cc index 2fe9ae774bf7a..b870a90cb091c 100644 --- a/paddle/phi/backends/dynload/cublas.cc +++ b/paddle/phi/backends/dynload/cublas.cc @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/cublas.h" -namespace phi { -namespace dynload { +namespace phi::dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; @@ -34,5 +33,4 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); #endif -} // namespace dynload -} // namespace phi +} // namespace phi::dynload diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h index 8053bbb6bd2ce..6da85283d6e71 100644 --- a/paddle/phi/backends/dynload/cublas.h +++ b/paddle/phi/backends/dynload/cublas.h @@ -94,8 +94,14 @@ extern void *cublas_dso_handle; __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ __macro(cublasDgetriBatched); \ + __macro(cublasCgetrfBatched); \ + __macro(cublasCgetriBatched); \ + __macro(cublasZgetrfBatched); \ + __macro(cublasZgetriBatched); \ __macro(cublasSmatinvBatched); \ __macro(cublasDmatinvBatched); \ + __macro(cublasCmatinvBatched); \ + __macro(cublasZmatinvBatched); \ __macro(cublasSgetrsBatched); \ __macro(cublasDgetrsBatched); diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc index ce8f87dc3cdfa..9d89b746df5b7 100644 --- a/paddle/phi/backends/dynload/cusparse.cc +++ b/paddle/phi/backends/dynload/cusparse.cc @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cusparse.h" -namespace phi { -namespace dynload { +namespace phi::dynload { std::once_flag cusparse_dso_flag; void *cusparse_dso_handle; @@ -34,5 +33,4 @@ CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP); #endif -} // namespace dynload -} // namespace phi +} // namespace phi::dynload diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 612a959fc307b..5d8e26732196d 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -351,14 +351,14 @@ void* GetCublasDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll"); #else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll"); #else return GetDsoHandleFromSearchPath( @@ -372,13 +372,13 @@ void* GetCublasDsoHandle() { } #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); @@ -400,13 +400,13 @@ void* GetCublasLtDsoHandle() { // APIs available after CUDA 10.1 #if defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { 
-#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); @@ -448,7 +448,7 @@ void* GetCUDNNDsoHandle() { "You should do this according to your CUDA installation directory and " "CUDNN version."); if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg); #else @@ -456,7 +456,7 @@ void* GetCUDNNDsoHandle() { FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); #endif } else if (CUDA_VERSION >= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg); #else @@ -467,7 +467,7 @@ void* GetCUDNNDsoHandle() { #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); #else -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES if (CUDA_VERSION >= 12030) { return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path}); @@ -488,7 +488,7 @@ void* GetCUPTIDsoHandle() { FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path}); #else @@ -497,7 +497,7 @@ void* GetCUPTIDsoHandle() { #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path}); #else @@ -520,7 +520,7 @@ void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path}); #else @@ -530,7 +530,7 @@ void* GetCurandDsoHandle() { #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); #else -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10"); #else return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so"); @@ -564,7 +564,7 @@ void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path}); #else @@ -572,7 +572,7 @@ void* GetCusolverDsoHandle() { FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); #endif #else -#ifdef 
WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); @@ -585,14 +585,14 @@ void* GetCusparseDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll"); #else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); #else return GetDsoHandleFromSearchPath( @@ -606,13 +606,13 @@ void* GetCusparseDsoHandle() { } #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12"); #else return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); @@ -716,7 +716,7 @@ void* GetNCCLDsoHandle() { return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); #else -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg); #else @@ -782,7 +782,7 @@ void* GetCUFFTDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); @@ -797,14 +797,14 @@ void* GetCUFFTDsoHandle() { } #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll"); #else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll"); #else return GetDsoHandleFromSearchPath( diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index fc0f8ee1e35e1..d3a569b34c5ac 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -1055,7 +1055,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT64, phi::DataType::BOOL, phi::DataType::FLOAT64, - phi::DataType::FLOAT32})}, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"transpose2_grad", XPUKernelSet({phi::DataType::FLOAT32, 
@@ -1248,6 +1249,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, {"sequence_unpad_xpu", XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"block_multihead_attention_xpu", XPUKernelSet({phi::DataType::FLOAT16})}, }; return s_xpu2_kernels; diff --git a/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto b/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto index 70c9e72aa5fe7..71c18ac426019 100644 --- a/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto +++ b/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto @@ -25,7 +25,7 @@ message ProcessMeshProto { // There are no duplicate process ids within one process mesh. repeated int64 process_ids = 2; - // The name of each dimension. + // The name of each dimension. repeated string dim_names = 3; } @@ -37,17 +37,17 @@ message TensorDistAttrProto { optional ProcessMeshProto process_mesh = 1; // The length of dims_mapping is same as the length of the tensor shape. - // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension + // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension // of the above process mesh. If dims_mapping[i] is -1, the i-th dimension of the tensor // will not be sharded. For example, given a tensor shape [2, 6, 12], a process mesh // shape [2, 3] and a dims_mapping [-1, 1, 0], each sharded tensor will have a shape [2, 2, 6]. repeated int64 dims_mapping = 2; - // The batch dimension of the corresponding tensor. + // The batch dimension of the corresponding tensor. optional int64 batch_dim = 3; - // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor - // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined. + // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor + // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined. repeated bool dynamic_dims = 4; // This field is used to distinguish vars which are in same process_mesh and in different vpp chunk @@ -60,16 +60,16 @@ message OperatorDistAttrProto { message TensorDistAttrMappingEntryProto { optional string name = 1; optional TensorDistAttrProto tensor_dist_attr = 2; - } + } // The key of this map is the input tensor name and the value is the distributed attribute - // of the input tensor required by this corresponding operator. - // The distributed attribute of the actual tensor may be not the same as that within + // of the input tensor required by this corresponding operator. + // The distributed attribute of the actual tensor may be not the same as that within // the distributed attribute of the operator. repeated TensorDistAttrMappingEntryProto input_dist_attrs = 1; // The key of this map is the output tensor name and the value is the distributed attribute - // of the output tensor required by this corresponding operator. - // The distributed attribute of the actual tensor may be not the same as that within + // of the output tensor required by this corresponding operator. + // The distributed attribute of the actual tensor may be not the same as that within // the distributed attribute of the operator. repeated TensorDistAttrMappingEntryProto output_dist_attrs = 2; @@ -81,7 +81,7 @@ message OperatorDistAttrProto { // may shared the same distributed operator, the field is use for this scenario. 
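The dims_mapping comment above already contains a worked example; the shard-shape arithmetic it describes can be sketched as follows (illustrative helper, not part of this patch):

.. code-block:: python

    def local_shape(tensor_shape, mesh_shape, dims_mapping):
        # dims_mapping[i] == -1 means dimension i is replicated;
        # otherwise dimension i is split over mesh axis dims_mapping[i].
        shape = list(tensor_shape)
        for i, axis in enumerate(dims_mapping):
            if axis != -1:
                shape[i] //= mesh_shape[axis]
        return shape

    # Matches the example in the comment: shape [2, 6, 12] on a [2, 3] mesh.
    assert local_shape([2, 6, 12], [2, 3], [-1, 1, 0]) == [2, 2, 6]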
optional string impl_type = 4; - // This field tells which distributed implementations of this corresponding operator + // This field tells which distributed implementations of this corresponding operator // will be selected for the actual computation. optional int64 impl_idx = 5; @@ -115,13 +115,13 @@ message DeviceProto { optional string type = 4; // The capability of this device. - optional DeviceCapabilityProto capability = 5; + optional DeviceCapabilityProto capability = 5; } -// This proto describes the capability of the link between two devices. -message LinkCapabilityProto { - optional int64 bandwidth = 1; // Bytes/s - optional int64 latency = 2; +// This proto describes the capability of the link between two devices. +message LinkCapabilityProto { + optional int64 bandwidth = 1; // Bytes/s + optional int64 latency = 2; } message LinkProto { @@ -133,14 +133,14 @@ message LinkProto { // Represent the link type. optional string type = 3; - + // The capability of this link. - optional LinkCapabilityProto capability = 4; + optional LinkCapabilityProto capability = 4; } // DeviceMesh is used to organize devices and like n-dimension array. message DeviceMeshProto { - // The global id of this mesh. + // The global id of this mesh. optional string name = 1; // The size of each dimension. @@ -150,13 +150,13 @@ message DeviceMeshProto { // There are no duplicate device ids within one device mesh. repeated int64 device_ids = 3; - // The name of each dimension. + // The name of each dimension. repeated string dim_names = 4; // The devices of this mesh. repeated DeviceProto devices = 5; - // The links are between devices. + // The links are between devices. repeated LinkProto links = 6; } diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc index 62fbd97c46ab2..98dfa339589a5 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc @@ -21,8 +21,7 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/phi/core/distributed/auto_parallel/proto_helper.h" -namespace phi { -namespace distributed { +namespace phi::distributed { using phi::distributed::auto_parallel::str_join; using phi::distributed::auto_parallel::TensorDistAttrProto; @@ -450,5 +449,4 @@ bool TensorDistAttr::is_partial(int64_t mesh_axis) const { void TensorDistAttr::set_skip_check_mesh(bool skip) { skip_check_mesh_ = skip; } -} // namespace distributed -} // namespace phi +} // namespace phi::distributed diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc index bd415480d64e9..947a4b77f6961 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc @@ -25,8 +25,7 @@ #include "paddle/phi/kernels/p_recv_kernel.h" #include "paddle/phi/kernels/p_send_kernel.h" -namespace phi { -namespace distributed { +namespace phi::distributed { bool XToRShrinkReshardFunction::IsSuitable( const DistTensor& in, const TensorDistAttr& out_dist_attr) { @@ -130,5 +129,4 @@ void XToRShrinkReshardFunction::Eval(phi::DeviceContext* dev_ctx, } } -} // namespace distributed -} // namespace phi +} // namespace phi::distributed diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 494fe160696ff..e63ced99ec539 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -49,6 +49,8 @@ class InferMetaContext { void EmplaceBackOutputs( paddle::small_vector outputs); + void UpdataInput(size_t idx, MetaTensor input) { inputs_[idx] = input; } + TEST_API virtual const MetaTensor& InputAt(size_t idx) const; TEST_API virtual std::vector InputsBetween( @@ -68,6 +70,10 @@ class InferMetaContext { const std::pair& InputRangeAt(size_t idx) const; TEST_API const std::pair& OutputRangeAt(size_t idx) const; + size_t InputsSize() const { return inputs_.size(); } + size_t OutputsSize() const { return outputs_.size(); } + size_t AttrsSize() const { return attrs_.size(); } + virtual ~InferMetaContext() = default; protected: diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 947af3af1d089..5fa75214fcfb5 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -75,6 +75,10 @@ class KernelContext { void AssignOutputRange(std::pair&& range, size_t idx); + void UpdataInput(size_t idx, const TensorBase* input) { + inputs_[idx] = input; + } + template const TensorType& InputAt(size_t idx) const { return static_cast(*(inputs_.at(idx))); diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index e16ec77a3b0e1..23259d40093af 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -445,6 +445,34 @@ void CudnnLSTMGradInferMeta( } } +void LSTMGradInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + MetaTensor* input_grad, + MetaTensor* h0_grad, + MetaTensor* c0_grad, + MetaTensor* weight_grad, + MetaTensor* bias_grad, + MetaConfig config) { + if (input_grad) { + input_grad->share_meta(input); + } + if (h0_grad) { + h0_grad->share_meta(h0); + } + if (c0_grad) { + c0_grad->share_meta(c0); + } + if (weight_grad) { + weight_grad->share_meta(weight); + } + if (bias_grad) { + bias_grad->share_meta(bias); + } +} + void DeformableConvGradInferMeta(const 
MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index e9971b5042ac0..89795c008d34d 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -167,6 +167,18 @@ void CudnnLSTMGradInferMeta( MetaTensor* init_c_grad, std::vector weight_list_grad); +void LSTMGradInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + MetaTensor* input_grad, + MetaTensor* h0_grad, + MetaTensor* c0_grad, + MetaTensor* weight_grad, + MetaTensor* bias_grad, + MetaConfig config = MetaConfig()); + void DeformableConvGradInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 3c3ef874854ab..aa4028efa1a6e 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -3573,6 +3573,45 @@ void TakeAlongAxisInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void TdmChildInferMeta(const MetaTensor& x, + const MetaTensor& tree_info, + int child_nums, + DataType dtype, + MetaTensor* child, + MetaTensor* leaf_mask) { + PADDLE_ENFORCE_GT( + child_nums, + 0, + phi::errors::InvalidArgument( + "ValueError: The value of the 'child_nums' must be greater than 0. " + "But received child_nums value = %d, ", + child_nums)); + + const auto& info_dims = tree_info.dims(); + const auto& input_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + info_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: The dimensions of the 'tree info' must be 2. " + "But received tree info's dimensions = %d, " + "tree info's shape = [%s].", + info_dims.size(), + info_dims)); + + auto output_dims = common::vectorize(input_dims); + output_dims.push_back(child_nums); + if (child != nullptr) { + child->set_dims(common::make_ddim(output_dims)); + leaf_mask->set_dims(common::make_ddim(output_dims)); + child->share_lod(x); + leaf_mask->share_lod(x); + child->set_dtype(x.dtype()); + leaf_mask->set_dtype(x.dtype()); + } +} + void TriangularSolveInferMeta(const MetaTensor& x, const MetaTensor& y, bool upper, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index e166746e3a646..391d01debd7a3 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -635,6 +635,13 @@ void TakeAlongAxisInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void TdmChildInferMeta(const MetaTensor& x, + const MetaTensor& tree_info, + int child_nums, + DataType dtype, + MetaTensor* child, + MetaTensor* leaf_mask); + void TriangularSolveInferMeta(const MetaTensor& x, const MetaTensor& y, bool upper, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 9987524d4997d..5cba3aa1c1a29 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -377,6 +377,89 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, } } +void BlockMultiheadAttentionInferXPUMeta( + const MetaTensor& qkv, + const MetaTensor& key_cache, + const MetaTensor& value_cache, + const MetaTensor& seq_lens_encoder, + const MetaTensor& seq_lens_decoder, + const MetaTensor& seq_lens_this_time, + const MetaTensor& padding_offsets, + const MetaTensor& cum_offsets, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& cache_k_per_batch_maxs, + const MetaTensor& cache_v_per_batch_maxs, + const MetaTensor& block_tables, + const MetaTensor&
pre_key_cache, + const MetaTensor& pre_value_cache, + const MetaTensor& rope_emb, + const MetaTensor& mask, + const MetaTensor& tgt_mask, + const MetaTensor& cache_k_quant_scales, + const MetaTensor& cache_v_quant_scales, + const MetaTensor& cache_k_dequant_scales, + const MetaTensor& cache_v_dequant_scales, + const MetaTensor& qkv_out_scale, + const MetaTensor& qkv_bias, + const MetaTensor& out_shift, + const MetaTensor& out_smooth, + const MetaTensor& max_enc_len_this_time, + const MetaTensor& max_dec_len_this_time, + int max_seq_len, + int block_size, + bool use_neox_style, + bool dynamic_cachekv_quant, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + const float out_scale, + const std::string& compute_dtype, + MetaTensor* fmha_out, + MetaTensor* qkv_out, + MetaTensor* key_cache_out, + MetaTensor* value_cache_out) { + BlockMultiheadAttentionInferMeta(qkv, + key_cache, + value_cache, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + padding_offsets, + cum_offsets, + cu_seqlens_q, + cu_seqlens_k, + block_tables, + pre_key_cache, + pre_value_cache, + rope_emb, + mask, + tgt_mask, + cache_k_quant_scales, + cache_v_quant_scales, + cache_k_dequant_scales, + cache_v_dequant_scales, + qkv_out_scale, + qkv_bias, + out_shift, + out_smooth, + max_enc_len_this_time, + max_dec_len_this_time, + max_seq_len, + block_size, + use_neox_style, + dynamic_cachekv_quant, + quant_round_type, + quant_max_bound, + quant_min_bound, + out_scale, + compute_dtype, + fmha_out, + qkv_out, + key_cache_out, + value_cache_out); +} + void Conv1dXPUInferMeta(const MetaTensor& x, const MetaTensor& x_max, const MetaTensor& filter, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index aa48f64434ee3..989c0dd28a1b4 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -128,6 +128,49 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, MetaTensor* key_cache_out, MetaTensor* value_cache_out); +void BlockMultiheadAttentionInferXPUMeta( + const MetaTensor& qkv, + const MetaTensor& key_cache, + const MetaTensor& value_cache, + const MetaTensor& seq_lens_encoder, + const MetaTensor& seq_lens_decoder, + const MetaTensor& seq_lens_this_time, + const MetaTensor& padding_offsets, + const MetaTensor& cum_offsets, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& cache_k_per_batch_maxs, + const MetaTensor& cache_v_per_batch_maxs, + const MetaTensor& block_tables, + const MetaTensor& pre_key_cache, + const MetaTensor& pre_value_cache, + const MetaTensor& rope_emb, + const MetaTensor& mask, + const MetaTensor& tgt_mask, + const MetaTensor& cache_k_quant_scales, + const MetaTensor& cache_v_quant_scales, + const MetaTensor& cache_k_dequant_scales, + const MetaTensor& cache_v_dequant_scales, + const MetaTensor& qkv_out_scale, + const MetaTensor& qkv_bias, + const MetaTensor& out_shift, + const MetaTensor& out_smooth, + const MetaTensor& max_enc_len_this_time, + const MetaTensor& max_dec_len_this_time, + int max_seq_len, + int block_size, + bool use_neox_style, + bool dynamic_cachekv_quant, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + const float out_scale, + const std::string& compute_dtype, + MetaTensor* fmha_out, + MetaTensor* qkv_out, + MetaTensor* key_cache_out, + MetaTensor* value_cache_out); + void Conv1dXPUInferMeta(const MetaTensor& x, const MetaTensor& x_max, const MetaTensor& filter, diff --git 
a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a80997970f8fb..84d0e7ffaf469 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1349,6 +1349,115 @@ void CudnnLSTMInferMeta( state_out->set_dtype(phi::DataType::UINT8); } +void LSTMInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + MetaTensor* hidden, + MetaTensor* cell, + MetaTensor* batch_gate, + MetaTensor* batch_cell_pre_act, + MetaConfig config) { + const auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2, + phi::errors::InvalidArgument( + "Input(X)'s rank must be 2, but received %d.", in_dims.size())); + + if (h0) { + PADDLE_ENFORCE_EQ( + c0.initialized(), + true, + phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM " + "should not be null at the same time.")); + const auto& h_dims = h0.dims(); + const auto& c_dims = c0.dims(); + PADDLE_ENFORCE_EQ(h_dims, + c_dims, + phi::errors::InvalidArgument( + "The dimension of Input(H0) and Input(C0) should " + "be the same, but received [%s] (H0) vs [%s] (C0).", + h_dims, + c_dims)); + } + + int frame_size = static_cast(in_dims[1] / 4); + const auto& w_dims = weight.dims(); + PADDLE_ENFORCE_EQ( + w_dims.size(), + 2, + phi::errors::InvalidArgument( + "The rank of Input(Weight) should be 2, but received %d.", + w_dims.size())); + PADDLE_ENFORCE_EQ(w_dims[0], + frame_size, + phi::errors::InvalidArgument( + "The first dimension of Input(Weight) should be %d, " + "but received %d.", + frame_size, + w_dims[0])); + PADDLE_ENFORCE_EQ(w_dims[1], + 4 * frame_size, + phi::errors::InvalidArgument( + "The second dimension of Input(Weight) should be 4 * " + "%d, but received %d.", + frame_size, + w_dims[1])); + + const auto& b_dims = bias.dims(); + PADDLE_ENFORCE_EQ(b_dims.size(), + 2, + phi::errors::InvalidArgument( + "The rank of Input(Bias) should be 2, but received %d.", + b_dims.size())); + PADDLE_ENFORCE_EQ( + b_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dimension of Input(Bias) should be 1, but received %d.", + b_dims[0])); + + if (use_peepholes) { + PADDLE_ENFORCE_EQ( + b_dims[1], + 7 * frame_size, + phi::errors::InvalidArgument( + "The second dimension of Input(Bias) should be 7 * %d if enable " + "peepholes connection, but received %d.", + frame_size, + b_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + b_dims[1], + 4 * frame_size, + phi::errors::InvalidArgument( + "The second dimension of Input(Bias) should be 4 * %d if disable " + "peepholes connection, but received %d.", + frame_size, + b_dims[1])); + } + + phi::DDim out_dims({in_dims[0], frame_size}); + hidden->set_dims(out_dims); + cell->set_dims(out_dims); + if (!is_test) { + batch_gate->set_dims(in_dims); + batch_cell_pre_act->set_dims(out_dims); + } + hidden->share_lod(input); + cell->share_lod(input); + hidden->set_dtype(input.dtype()); + cell->set_dtype(input.dtype()); +} + void DecayedAdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 56dff7422b2cc..a73212505f669 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -292,6 +292,23 @@ void CudnnLSTMInferMeta( MetaTensor* reserve, MetaTensor* state_out); +void 
LSTMInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + MetaTensor* hidden, + MetaTensor* cell, + MetaTensor* batch_gate, + MetaTensor* batch_cell_pre_act, + MetaConfig config = MetaConfig()); + void DecayedAdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, diff --git a/paddle/phi/infermeta/spmd_rules/flatten.cc b/paddle/phi/infermeta/spmd_rules/flatten.cc index a0f084b491771..b33411e4b2518 100644 --- a/paddle/phi/infermeta/spmd_rules/flatten.cc +++ b/paddle/phi/infermeta/spmd_rules/flatten.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" #include "paddle/phi/infermeta/spmd_rules/dim_trans.h" +#include "paddle/phi/infermeta/spmd_rules/reshape.h" #include "paddle/phi/infermeta/spmd_rules/utils.h" namespace phi { @@ -105,41 +106,31 @@ SpmdInfo FlattenInferSpmd(const DistMetaTensor& x, x_ndim, x_dims_mapping.size())); - // Step1: Build the transformation from - // the original shape to the target shape - + // obtain target shape and use ReshapeInferSpmdDynamic to infer start_axis = PreprocessAxis(start_axis, x_ndim); stop_axis = PreprocessAxis(stop_axis, x_ndim); - std::vector> trans = - MakeFlattenDimTrans(src_shape, start_axis, stop_axis); - - // Step2: Infer the dims mapping of input (if reshard is - // needed) and output from the dimension transformation. - std::vector> dims_mapping_vec = - InferFromDimTrans(x, trans); - - // Step3: Update the dist attributes of input - // and output with the inferred dims mapping. 
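// The replacement below computes the flattened target shape directly and
// delegates SPMD inference to ReshapeInferSpmdDynamic. A standalone sketch of
// that shape computation (FlattenTargetShape is a hypothetical helper written
// for illustration, not part of this patch; requires <cstdint> and <vector>):
std::vector<int64_t> FlattenTargetShape(const std::vector<int64_t>& src,
                                        int64_t start_axis,
                                        int64_t stop_axis) {
  std::vector<int64_t> dst;
  int64_t flatten_size = 1;
  for (int64_t i = 0; i < static_cast<int64_t>(src.size()); ++i) {
    if (i < start_axis || i > stop_axis) {
      dst.push_back(src[i]);  // axes outside [start_axis, stop_axis] are kept
    } else {
      flatten_size *= src[i];  // axes inside the range are merged
      if (i == stop_axis) dst.push_back(flatten_size);
    }
  }
  return dst;  // e.g. src = {2, 3, 4, 5}, axes [1, 2] -> {2, 12, 5}
}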
- TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src); - x_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]); - TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src); - out_dist_attr.set_dims_mapping(dims_mapping_vec[1]); + std::vector dst_shape; + int64_t flatten_size = 1; + for (int64_t i = 0; i < x_ndim; i++) { + if (i < start_axis || i > stop_axis) { + dst_shape.emplace_back(src_shape[i]); + } else { + flatten_size *= src_shape[i]; + if (i == stop_axis) { + dst_shape.emplace_back(flatten_size); + } + } + } VLOG(4) << "FlattenInferSpmd: X shape: [" << str_join(src_shape) << "]"; VLOG(4) << "Start_axis: " << start_axis; - VLOG(4) << "Stop_axis: " << start_axis; - VLOG(4) << "Transformation from input to output:"; - for (int64_t i = 0, n = static_cast(trans.size()); i < n; i++) { - std::shared_ptr t = trans[i]; - VLOG(4) << "\tOut axis[" << i << "]: " << t->to_string(); - } - VLOG(4) << "X dims_mapping_src: [" << str_join(x_dims_mapping) - << "] dims_mapping_dst: [" << str_join(dims_mapping_vec[0]) << "]"; - VLOG(4) << "Out dims_mapping: [" << str_join(dims_mapping_vec[1]) << "]\n\n"; - - return {{x_dist_attr_dst}, {out_dist_attr}}; + VLOG(4) << "Stop_axis: " << stop_axis; + VLOG(4) << "FlattenInferSpmd: output shape: [" << str_join(dst_shape) << "]"; + VLOG(4) << "use ReshapeInferSpmdDynamic to infer distributed attribute"; + return ReshapeInferSpmdDynamic(x, dst_shape); } +// TODO(jeff41404): consider xshape and use ReshapeInferSpmdReverse in future SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, int start_axis, @@ -198,5 +189,10 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, return {{x_dist_attr}, {out_dist_attr_dst}}; } +SpmdInfo FlattenGradInferSpmd(const DistMetaTensor& xshape, + const DistMetaTensor& out_grad) { + return ReshapeGradInferSpmd(xshape, out_grad); +} + } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/flatten.h b/paddle/phi/infermeta/spmd_rules/flatten.h index bb62d8c0d7b0a..28bf5e56d5256 100644 --- a/paddle/phi/infermeta/spmd_rules/flatten.h +++ b/paddle/phi/infermeta/spmd_rules/flatten.h @@ -30,5 +30,8 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, int start_axis, int stop_axis); + +SpmdInfo FlattenGradInferSpmd(const DistMetaTensor& xshape, + const DistMetaTensor& out_grad); } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 6c278867d9ac3..d4731ce7afd3c 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -645,6 +645,141 @@ void GlobalScatterInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void AddGroupNormSiluInferMeta(const MetaTensor& x, + const MetaTensor& residual, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + int groups, + const std::string& data_layout_str, + const std::string& activation, + MetaTensor* y, + MetaTensor* residual_out, + MetaTensor* mean, + MetaTensor* variance) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in GroupNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE(mean, + nullptr, + phi::errors::InvalidArgument( + "The mean in GroupNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + variance, + nullptr, + phi::errors::InvalidArgument( + "The variance in GroupNormInferMeta can't be nullptr.")); + + auto x_dim = x.dims(); + PADDLE_ENFORCE_GE( + x_dim.size(), + 2, + 
phi::errors::InvalidArgument( + "The Input(X)'s dimension of Op(group_norm) must be " + "greater than 1. But received: %u-D Tensor, which shape is [%s].", + x_dim.size(), + x_dim)); + + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); + const int64_t channel_num = + (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]); + auto batch_size = x_dim[0]; + PADDLE_ENFORCE_LE( + groups, + channel_num, + phi::errors::InvalidArgument( + "The Attr(groups) of Op(group_norm) must be less than or " + "equal to the number of channels. But received: groups " + "is [%s], channels is [%s], the Attr(data_layout) " + "is [%s]. The error may come from wrong data_layout setting.", + groups, + channel_num, + data_layout_str)); + PADDLE_ENFORCE_GE( + groups, + 1, + phi::errors::InvalidArgument( + "The Attr(groups) of Op(group_norm) must be " + "greater than or equal to 1. But received: groups is [%s].", + groups)); + PADDLE_ENFORCE_EQ( + channel_num % groups, + 0, + phi::errors::InvalidArgument( + "Expected number of channels in input to be divisible by " + "num_groups, but got input channel is %d and num_groups is %d", + channel_num, + groups)); + + if (scale) { + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The Input(Scale) of Op(group_norm) should be 1-D Tensor. " + "But received: %u-D Tensor, the shape of Input(Scale) is [%s].", + scale.dims().size(), + scale.dims())); + PADDLE_ENFORCE_EQ( + scale.dims()[0], + channel_num, + phi::errors::InvalidArgument( + "The Input(Scale)'s first dimension size of Op(group_norm) must " + "be equal to the number of channels. But received: the " + "Input(Scale)'s first dimension size is [%s], the channels is " + "[%s], the Attr(data_layout) is [%s]. The error may come " + "from wrong data_layout setting.", + scale.dims()[0], + channel_num, + data_layout_str)); + } + if (bias) { + PADDLE_ENFORCE_EQ( + bias.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The Input(Bias) of Op(group_norm) should be 1-D Tensor. " + "But received: %u-D Tensor, the shape of Input(Bias) is [%s].", + bias.dims().size(), + bias.dims())); + PADDLE_ENFORCE_EQ( + bias.dims()[0], + channel_num, + phi::errors::InvalidArgument( + "The Input(Bias)'s first dimension size of " + "Op(group_norm) must be equal to the number of channels. " + "But received: the Input(Bias)'s first dimension size is [%s], " + "the channels is [%s], the Attr(data_layout) is [%s]. The " + "error may come from wrong data_layout setting.", + bias.dims()[0], + channel_num, + data_layout_str)); + } + y->set_dims(x_dim); + y->set_dtype(x.dtype()); + y->share_lod(x); + + phi::DataType x_dtype = x.dtype(); + phi::DataType param_type = + (x_dtype == phi::DataType::BFLOAT16 || x_dtype == phi::DataType::FLOAT16) + ? 
phi::DataType::FLOAT32 + : x_dtype; + if (mean) { + mean->set_dims({batch_size, groups}); + mean->set_dtype(param_type); + } + if (variance) { + variance->set_dims({batch_size, groups}); + variance->set_dtype(param_type); + } + if (residual_out) { + residual_out->set_dims(x_dim); + residual_out->set_dtype(x.dtype()); + residual_out->share_lod(x); + } +} + void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 8732a87c55cd6..1b276846619e6 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -144,6 +144,19 @@ void GlobalScatterInferMeta(const MetaTensor& x, bool use_calc_stream, MetaTensor* out); +void AddGroupNormSiluInferMeta(const MetaTensor& x, + const MetaTensor& residual, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + int groups, + const std::string& data_layout, + const std::string& activation, + MetaTensor* y, + MetaTensor* residual_out, + MetaTensor* mean, + MetaTensor* variance); + void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 0aca647dd6a49..96d34a0157e8d 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -55,6 +55,21 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE) endif() if(WITH_CUTLASS) + add_custom_target( + gemm_epilogue_compile_script ALL + COMMAND bash compile.sh "${PYTHON_EXECUTABLE}" "${CUDA_TOOLKIT_ROOT_DIR}" + \"${NVCC_ARCH_BIN}\" "${CMAKE_COMMAND}" + WORKING_DIRECTORY + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue + COMMENT "GemmEpilogue compile script") + add_custom_target( + fused_conv2d_add_act_compile_script ALL + COMMAND bash compile.sh "${PYTHON_EXECUTABLE}" "${CUDA_TOOLKIT_ROOT_DIR}" + \"${NVCC_ARCH_BIN}\" "${CMAKE_COMMAND}" + WORKING_DIRECTORY + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d + COMMENT "FusedConv2dAddAct compile script") + execute_process( COMMAND ${PYTHON_EXECUTABLE} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc index f39bddbb443ba..422f566c6612e 100644 --- a/paddle/phi/kernels/cpu/cumprod_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -32,8 +32,16 @@ void CumprodKernel(const Context& dev_ctx, DenseTensor* out) { const DenseTensor* x = &input; auto* x_data = x->data(); - auto* out_data = dev_ctx.template Alloc(out); + auto* out_ptr = dev_ctx.template Alloc(out); DDim shape = x->dims(); + DenseTensor out_tmp; + T* out_data = nullptr; + if (x_data == out_ptr) { + out_tmp.Resize(shape); + out_data = dev_ctx.template Alloc(&out_tmp); + } else { + out_data = out_ptr; + } size_t outer_dim = 1; size_t mid_dim = 1; @@ -88,6 +96,9 @@ void CumprodKernel(const Context& dev_ctx, } } } + if (x_data == out_ptr) { + memcpy(out_ptr, out_data, out->numel() * sizeof(T)); + } } } // namespace phi diff --git a/paddle/phi/kernels/cpu/inverse_grad_kernel.cc b/paddle/phi/kernels/cpu/inverse_grad_kernel.cc index 97c10e69c8eab..5014cfd0f95c7 100644 --- a/paddle/phi/kernels/cpu/inverse_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/inverse_grad_kernel.cc @@ -16,5 +16,11 @@ #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - inverse_grad, CPU, ALL_LAYOUT, phi::InverseGradKernel, float, double) {} +PD_REGISTER_KERNEL(inverse_grad, + CPU, + ALL_LAYOUT, + phi::InverseGradKernel, + float, + double, + phi::dtype::complex, 
+ phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/inverse_kernel.cc b/paddle/phi/kernels/cpu/inverse_kernel.cc index 4b21718eca3f2..6fecef6f888dc 100644 --- a/paddle/phi/kernels/cpu/inverse_kernel.cc +++ b/paddle/phi/kernels/cpu/inverse_kernel.cc @@ -16,5 +16,11 @@ #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - inverse, CPU, ALL_LAYOUT, phi::InverseKernel, float, double) {} +PD_REGISTER_KERNEL(inverse, + CPU, + ALL_LAYOUT, + phi::InverseKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/fluid/operators/ops_signature/number_count_sig.cc b/paddle/phi/kernels/cpu/lstm_grad_kernel.cc similarity index 58% rename from paddle/fluid/operators/ops_signature/number_count_sig.cc rename to paddle/phi/kernels/cpu/lstm_grad_kernel.cc index 48e0b4fce9ac1..ddaa85c8bdce1 100644 --- a/paddle/fluid/operators/ops_signature/number_count_sig.cc +++ b/paddle/phi/kernels/cpu/lstm_grad_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,15 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/compat/op_utils.h" +#include +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" -namespace phi { - -KernelSignature NumberCountOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("number_count", {"numbers"}, {"upper_range"}, {"Out"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(number_count, phi::NumberCountOpArgumentMapping); +PD_REGISTER_KERNEL( + lstm_grad, CPU, ALL_LAYOUT, phi::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc b/paddle/phi/kernels/cpu/lstm_kernel.cc similarity index 50% rename from paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc rename to paddle/phi/kernels/cpu/lstm_kernel.cc index d3bf58bdec3c8..848ba68bb3b76 100644 --- a/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc +++ b/paddle/phi/kernels/cpu/lstm_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,19 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
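// Shape relations enforced by the new LSTMInferMeta for the lstm kernels
// registered here, writing D for frame_size (= in_dims[1] / 4):
//   Input(X):      [T, 4*D]  (concatenated gate pre-activations)
//   Input(Weight): [D, 4*D]
//   Input(Bias):   [1, 4*D], or [1, 7*D] when use_peepholes is true
//   Output(Hidden), Output(Cell): [T, D]
// Worked example: D = 128 gives X [T, 512], Weight [128, 512],
// Bias [1, 512] (or [1, 896] with peepholes), Hidden and Cell [T, 128].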
-#include "paddle/phi/core/compat/op_utils.h" +#include +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" -namespace phi { - -KernelSignature ChannelShuffleGradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("channel_shuffle_grad", - {"Out@GRAD"}, - {"groups", "data_format"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(channel_shuffle_grad, - phi::ChannelShuffleGradOpArgumentMapping); +PD_REGISTER_KERNEL(lstm, CPU, ALL_LAYOUT, phi::LSTMKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc index 5b43fb02b5117..9d1319e0b5e4a 100644 --- a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(meshgrid_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/meshgrid_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_kernel.cc index 35e43f7bbc85e..a0239da6bb128 100644 --- a/paddle/phi/kernels/cpu/meshgrid_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_kernel.cc @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(meshgrid, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/tdm_child_kernel.cc b/paddle/phi/kernels/cpu/tdm_child_kernel.cc index 246f2113d65e8..3fabbba572f7e 100644 --- a/paddle/phi/kernels/cpu/tdm_child_kernel.cc +++ b/paddle/phi/kernels/cpu/tdm_child_kernel.cc @@ -104,7 +104,7 @@ void TDMChildKernel(const Context &dev_ctx, const phi::DenseTensor &x, const phi::DenseTensor &tree_info, int child_nums, - int dtype, + phi::DataType dtype, phi::DenseTensor *child, phi::DenseTensor *leaf_mask) { const auto &input_type = x.dtype(); @@ -132,7 +132,7 @@ void TDMChildKernel(const Context &dev_ctx, DataTypeToString(DataType::INT32), DataTypeToString(DataType::INT64))); - auto output_type = phi::TransToPhiDataType(dtype); + auto output_type = dtype; bool out_type_match = output_type == DataType::INT32 || output_type == DataType::INT64; PADDLE_ENFORCE_EQ(out_type_match, diff --git a/paddle/phi/kernels/cpu/tile_kernel.cc b/paddle/phi/kernels/cpu/tile_kernel.cc index 2320c30310a64..30eb1d5cd6c47 100644 --- a/paddle/phi/kernels/cpu/tile_kernel.cc +++ b/paddle/phi/kernels/cpu/tile_kernel.cc @@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, + phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 96b2128eee16c..a58b5998a6703 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -685,6 +685,63 @@ struct CUBlas> { ldb, batch_size)); } + + static void GETRF_BATCH(cublasHandle_t handle, + int n, + phi::dtype::complex **A, + int lda, + int *ipiv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetrfBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + info, + batch_size)); + } + + static void GETRI_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + const int *ipiv, + phi::dtype::complex **Ainv, + int ldc, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetriBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(Ainv), + ldc, + info, + batch_size)); + } + 
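// Sketch of the call order these two wrappers serve in batched inversion
// (illustrative only; d_A and d_Ainv are assumed device-resident arrays of
// matrix pointers, d_ipiv holds n * batch pivots and d_info batch status
// ints, as the cuBLAS batched APIs require):
//   CUBlas<phi::dtype::complex<float>>::GETRF_BATCH(
//       handle, n, d_A, /*lda=*/n, d_ipiv, d_info, batch);
//   CUBlas<phi::dtype::complex<float>>::GETRI_BATCH(
//       handle, n, d_A_const, /*lda=*/n, d_ipiv, d_Ainv, /*ldc=*/n,
//       d_info, batch);
// GETRF overwrites d_A with the LU factors in place; GETRI then writes the
// inverses out of place into d_Ainv. The MATINV_BATCH wrapper added next is
// the single-call alternative, but cuBLAS documents matinvBatched as limited
// to small matrices (n <= 32), so the GETRF/GETRI pair is the general path.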
+ static void MATINV_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + phi::dtype::complex **Ainv, + int lda_inv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCmatinvBatched( + handle, + n, + reinterpret_cast(A), + lda, + reinterpret_cast(Ainv), + lda_inv, + info, + batch_size)); + } }; template <> @@ -923,6 +980,63 @@ struct CUBlas> { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + static void GETRF_BATCH(cublasHandle_t handle, + int n, + phi::dtype::complex **A, + int lda, + int *ipiv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetrfBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + info, + batch_size)); + } + + static void GETRI_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + const int *ipiv, + phi::dtype::complex **Ainv, + int ldc, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetriBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(Ainv), + ldc, + info, + batch_size)); + } + + static void MATINV_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + phi::dtype::complex **Ainv, + int lda_inv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZmatinvBatched( + handle, + n, + reinterpret_cast(A), + lda, + reinterpret_cast(Ainv), + lda_inv, + info, + batch_size)); + } }; template <> diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc index fd49748666a6e..c42bbbd3a5318 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cc +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" -namespace phi { -namespace funcs { +namespace phi::funcs { /* * All tensors' dimension should be the same and the values of @@ -132,5 +131,4 @@ struct SplitFunctor { FOR_ALL_TYPES(DEFINE_FUNCTOR); -} // namespace funcs -} // namespace phi +} // namespace phi::funcs diff --git a/paddle/phi/kernels/funcs/lstm_utils.h b/paddle/phi/kernels/funcs/lstm_utils.h new file mode 100644 index 0000000000000..4a02b097fd340 --- /dev/null +++ b/paddle/phi/kernels/funcs/lstm_utils.h @@ -0,0 +1,36 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
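// The new lstm_utils.h beginning here hosts ReorderInitState, shared by the
// lstm CPU and GPU kernels. It follows the CopyMatrixRowsFunctor convention:
// with indexed_src == true it gathers, dst[i] = src[index_lod[i]]; with
// indexed_src == false it scatters, dst[index_lod[i]] = src[i]. A hedged
// usage sketch, with `order` standing in for the batch order produced by the
// sequence2batch functors:
//   phi::DenseTensor ordered_h0;
//   ReorderInitState<Context, T>(dev_ctx, h0, order, &ordered_h0, true);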
+ +#pragma once +#include + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/mixed_vector.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" + +namespace phi { + +template +inline void ReorderInitState(const Context& dev_ctx, + const phi::DenseTensor& src, + phi::Vector index_lod, + phi::DenseTensor* dst, + bool indexed_src) { + phi::funcs::CopyMatrixRowsFunctor row_shuffle; + dst->Resize(src.dims()); + dev_ctx.template Alloc(dst); + row_shuffle(dev_ctx, src, index_lod, dst, indexed_src); +} +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc index c316970e6a560..2a3749ef36b81 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace phi { -namespace funcs { +namespace phi::funcs { template void MatrixInverseFunctor::operator()(const Context& dev_ctx, @@ -28,6 +27,7 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; +template class MatrixInverseFunctor>; +template class MatrixInverseFunctor>; -} // namespace funcs -} // namespace phi +} // namespace phi::funcs diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index c0ea7ad84c41b..f46dd714c9f55 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -131,6 +131,8 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; +template class MatrixInverseFunctor>; +template class MatrixInverseFunctor>; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h index f0cd265a54648..d45f7d8863a63 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -25,14 +25,69 @@ limitations under the License. 
*/ namespace phi { namespace funcs { +template +struct MapMatrixInverseFunctor { + void operator()( + const Context& dev_ctx, const T* a_ptr, T* a_inv_ptr, int offset, int n) { + using Matrix = + Eigen::Matrix; + using EigenMatrixMap = Eigen::Map; + using ConstEigenMatrixMap = Eigen::Map; + + ConstEigenMatrixMap mat(a_ptr + offset, n, n); + EigenMatrixMap mat_inv(a_inv_ptr + offset, n, n); + Eigen::PartialPivLU lu; + lu.compute(mat); + + const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT(min_abs_pivot, + static_cast(0), + errors::InvalidArgument("Input is not invertible.")); + mat_inv.noalias() = lu.inverse(); + } +}; + +template +struct MapMatrixInverseFunctor> { + void operator()(const Context& dev_ctx, + const phi::dtype::complex* a_ptr, + phi::dtype::complex* a_inv_ptr, + int offset, + int n) { + using Matrix = Eigen::Matrix, + Eigen::Dynamic, + Eigen::Dynamic, + Eigen::RowMajor>; + using EigenMatrixMap = Eigen::Map; + using ConstEigenMatrixMap = Eigen::Map; + std::complex* std_ptr = new std::complex[n * n]; + std::complex* std_inv_ptr = new std::complex[n * n]; + for (int i = 0; i < n * n; i++) { + *(std_ptr + i) = static_cast>(*(a_ptr + offset + i)); + } + ConstEigenMatrixMap mat(std_ptr, n, n); + EigenMatrixMap mat_inv(std_inv_ptr, n, n); + Eigen::PartialPivLU lu; + lu.compute(mat); + + const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_NE(min_abs_pivot, + static_cast>(0), + errors::InvalidArgument("Input is not invertible.")); + mat_inv.noalias() = lu.inverse(); + for (int i = 0; i < n * n; i++) { + *(a_inv_ptr + offset + i) = + static_cast>(*(std_inv_ptr + i)); + } + delete[] std_ptr; + delete[] std_inv_ptr; + } +}; + template void ComputeInverseEigen(const Context& dev_ctx, const DenseTensor& a, DenseTensor* a_inv) { - using Matrix = - Eigen::Matrix; - using EigenMatrixMap = Eigen::Map; - using ConstEigenMatrixMap = Eigen::Map; const auto& mat_dims = a.dims(); const int rank = mat_dims.size(); int n = mat_dims[rank - 1]; @@ -41,17 +96,13 @@ void ComputeInverseEigen(const Context& dev_ctx, const T* a_ptr = a.data(); T* a_inv_ptr = dev_ctx.template Alloc(a_inv); + // Putting phi::dtype::complex into eigen::matrix has a problem, + // it's not going to get the right result, + // so we're going to convert it to std::complex and + // then we're going to put it into eigen::matrix. for (int i = 0; i < batch_size; ++i) { - ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); - EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n); - Eigen::PartialPivLU lu; - lu.compute(mat); - - const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); - PADDLE_ENFORCE_GT(min_abs_pivot, - static_cast(0), - errors::InvalidArgument("Input is not invertible.")); - mat_inv.noalias() = lu.inverse(); + MapMatrixInverseFunctor functor; + functor(dev_ctx, a_ptr, a_inv_ptr, i * n * n, n); } } diff --git a/paddle/phi/kernels/funcs/sequence_pooling.cc b/paddle/phi/kernels/funcs/sequence_pooling.cc index f4ee9c323366e..1fdaadfea01a1 100644 --- a/paddle/phi/kernels/funcs/sequence_pooling.cc +++ b/paddle/phi/kernels/funcs/sequence_pooling.cc @@ -21,8 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/jit/kernels.h" #include "paddle/phi/kernels/funcs/math_function.h" -namespace phi { -namespace funcs { +namespace phi::funcs { template ; template class SequencePoolGradFunctor; template class SequencePoolGradFunctor; -} // namespace funcs -} // namespace phi +} // namespace phi::funcs diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc index 4ff18849316d8..456d3370990cb 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc @@ -23,8 +23,7 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" -namespace phi { -namespace fusion { +namespace phi::fusion { template void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, @@ -148,8 +147,7 @@ void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, true); } -} // namespace fusion -} // namespace phi +} // namespace phi::fusion PD_REGISTER_KERNEL(fusion_seqconv_eltadd_relu, CPU, diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt index d760ce773c135..abcf220aa5c54 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.23) +cmake_minimum_required(VERSION 3.18) if(NOT DEFINED PYTHON_EXECUTABLE) message( diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh index eb13c7dd6723d..8ac34b55144df 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh @@ -13,21 +13,38 @@ # limitations under the License. set -e -cutlass_repo_directory="cutlass" -if [ ! -d "$cutlass_repo_directory" ]; then - git clone --branch v3.0.0 https://github.com/NVIDIA/cutlass -fi - build_directory="build" if [ ! -d "$build_directory" ]; then mkdir $build_directory fi -python_exe_path="python" -cuda_root_path="/usr/local/cuda" -gpu_cc="80" +libname="$build_directory/libCutlassConv2d.so" +if [ -e "$libname" ]; then + exit 0 +fi + +default_python_exe_path="/usr/bin/python" +default_cuda_root_path="/usr/local/cuda" +default_gpu_cc="80" +default_cmake_command="cmake" + +python_exe_path="${1:-$default_python_exe_path}" +cuda_root_path="${2:-$default_cuda_root_path}" +gpu_cc="${3:-$default_gpu_cc}" +cmake_command="${4:-$default_cmake_command}" + +case "$gpu_cc" in + 75|80|86|89) ;; + *) exit 0 ;; +esac + +cutlass_repo_directory="cutlass" +if [ ! -d "$cutlass_repo_directory" ]; then + git clone --branch v3.0.0 https://github.com/NVIDIA/cutlass +fi + cd $build_directory -cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc -make -j +$cmake_command .. 
-DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc +make -j8 cd - diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt index 6ad5035e9dcd6..fc9cfa1cfd919 100644 --- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt +++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.23) +cmake_minimum_required(VERSION 3.18) if(NOT DEFINED PYTHON_EXECUTABLE) message( diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh index f8a5463239a95..4352cb6381354 100644 --- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh +++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh @@ -13,21 +13,38 @@ # limitations under the License. set -e -cutlass_repo_directory="cutlass" -if [ ! -d "$cutlass_repo_directory" ]; then - git clone --branch v2.11.0 https://github.com/NVIDIA/cutlass -fi - build_directory="build" if [ ! -d "$build_directory" ]; then mkdir $build_directory fi -python_exe_path="/usr/bin/python" -cuda_root_path="/usr/local/cuda" -gpu_cc="80" +libname="$build_directory/libCutlassGemmEpilogue.so" +if [ -e "$libname" ]; then + exit 0 +fi + +default_python_exe_path="/usr/bin/python" +default_cuda_root_path="/usr/local/cuda" +default_gpu_cc="80" +default_cmake_command="cmake" + +python_exe_path="${1:-$default_python_exe_path}" +cuda_root_path="${2:-$default_cuda_root_path}" +gpu_cc="${3:-$default_gpu_cc}" +cmake_command="${4:-$default_cmake_command}" + +case "$gpu_cc" in + 80|86|89) ;; + *) exit 0 ;; +esac + +cutlass_repo_directory="cutlass" +if [ ! -d "$cutlass_repo_directory" ]; then + git clone --branch v2.11.0 https://github.com/NVIDIA/cutlass +fi + cd $build_directory -cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc -make -j +$cmake_command .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc +make -j8 cd - diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h index 8f1be5983f646..8b36a43fdf843 100644 --- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h +++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include +#include #include #include "paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_decl.h" diff --git a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc new file mode 100644 index 0000000000000..b38a0b1c00dc2 --- /dev/null +++ b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc @@ -0,0 +1,606 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "glog/logging.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/flash_attn_kernel.h" +#include "xpu/xdnn.h" + +namespace phi { +namespace fusion { + +template +int GetMaxLen(const Context& dev_ctx, + const phi::DenseTensor& seq_lens_tensor, + phi::DenseTensor* max_len_tensor, + const int batch_size) { + int max_len_cpu = 0; + int r = baidu::xpu::api::reduce_max(dev_ctx.x_context(), + seq_lens_tensor.data(), + max_len_tensor->data(), + {batch_size}, + {0}); + PD_CHECK(r == 0, "baidu::xpu::api::reduce_max failed."); + xpu_wait(dev_ctx.x_context()->xpu_stream); + r = xpu_memcpy(&max_len_cpu, + max_len_tensor->data(), + sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + PD_CHECK(r == 0, "xpu_memcpy failed."); + return max_len_cpu; +} + +template +void qkv_split_rope_kernel(const Context& xpu_ctx, + const DenseTensor& qkv_input, + const DenseTensor& rotary_emb, + const DenseTensor& seq_lens, + const baidu::xpu::api::VectorParam& lods, + int bsz, + int max_seq_len, + int token_num, + int num_head, + int dim_head, + DenseTensor* q_out, + DenseTensor* k_out, + DenseTensor* v_out) { + xpu::ctx_guard RAII_GUARD(xpu_ctx.x_context()); + using XPUType = typename XPUTypeTrait::Type; + auto q_data = reinterpret_cast(q_out->data()); + auto k_data = reinterpret_cast(k_out->data()); + auto v_data = reinterpret_cast(v_out->data()); + int r = baidu::xpu::api::split( + xpu_ctx.x_context(), + reinterpret_cast(qkv_input.data()), + {q_data, k_data, v_data}, + {token_num, 3, num_head * dim_head}, + {1, 1, 1}, + 1); + const_cast(&qkv_input)->clear(); + PD_CHECK(r == 0, "baidu::xpu::api::split failed."); + r = baidu::xpu::api::vsl_rotary_neox_embedding( + xpu_ctx.x_context(), + q_data, + k_data, + rotary_emb.data(), + q_data, + k_data, + lods, + 1, + max_seq_len, + num_head, + dim_head, + "BLHD", + {}, + "NORMAL", + -1); + PD_CHECK(r == 0, "baidu::xpu::api::vsl_rotary_neox_embedding failed."); +} + +template +void BlockMultiheadAttentionXPUKernel( + const Context& dev_ctx, + const DenseTensor& qkv, + const DenseTensor& key_cache, + const DenseTensor& value_cache, + const DenseTensor& seq_lens_encoder, + const DenseTensor& seq_lens_decoder, + const DenseTensor& seq_lens_this_time, + const DenseTensor& padding_offsets, + const DenseTensor& cum_offsets, + const DenseTensor& cu_seqlens_q, + const DenseTensor& cu_seqlens_k, + const DenseTensor& block_tables, + const DenseTensor& cache_k_per_batch_maxs, + const DenseTensor& cache_v_per_batch_maxs, + const paddle::optional& pre_key_cache, + const paddle::optional& pre_value_cache, + const paddle::optional& rope_emb, + const paddle::optional& mask, + const paddle::optional& tgt_mask, + const paddle::optional& cache_k_quant_scales, + const paddle::optional& cache_v_quant_scales, + const paddle::optional& cache_k_dequant_scales, + const paddle::optional& cache_v_dequant_scales, + const paddle::optional& qkv_out_scale, + const paddle::optional& qkv_bias, + const paddle::optional& out_shift, + const paddle::optional& out_smooth, + const paddle::optional& max_enc_len_this_time, + const paddle::optional& max_dec_len_this_time, + int max_seq_len, + int block_size, + bool use_neox_style, + const bool dynamic_cachekv_quant, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + const float 
out_scale,
+    const std::string& compute_dtype,
+    DenseTensor* fmha_out,
+    DenseTensor* qkv_out,
+    DenseTensor* key_cache_out,
+    DenseTensor* value_cache_out) {
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  auto xpu_context = dev_ctx.x_context();
+
+  using XPUType = typename XPUTypeTrait::Type;
+
+  phi::DenseTensor qkv_buf;
+  phi::DenseTensor fmha_buf;
+  VLOG(3) << "fmha_out " << fmha_out->dims();
+  if (out_scale <= 0) {
+    dev_ctx.template Alloc(fmha_out);
+    fmha_buf = *fmha_out;
+  } else {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("out_scale > 0 is not supported now."));
+  }
+  int r = xpu::constant(xpu_context,
+                        reinterpret_cast(fmha_buf.data()),
+                        fmha_buf.numel(),
+                        0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  const auto& input_dims = qkv.dims();
+  const auto& key_cache_dims = key_cache.dims();
+  const int token_num = input_dims[0];
+  const int num_head = key_cache_dims[1];
+  const int dim_head = key_cache_dims[3];
+  const int bsz = cum_offsets.dims()[0];
+  const int max_block_per_seq = block_tables.dims()[1];
+  VLOG(3) << "bsz: " << bsz << " token_num: " << token_num
+          << " num_head: " << num_head << " dim_head: " << dim_head
+          << " max_block_per_seq: " << max_block_per_seq;
+  VLOG(3) << "fmha_out_dims: " << fmha_out->dims();
+  bool causal = true;
+  if (mask) {
+    causal = false;
+  }
+  bool use_pre_cache = false;
+  int pre_cache_length = 0;
+  if (pre_key_cache) {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("pre_key_cache is not supported now."));
+  }
+  VLOG(3) << "token_num: " << token_num
+          << " pre_cache_length: " << pre_cache_length;
+
+  int max_dec_len_this_time_data(0);
+  if (!max_dec_len_this_time) {
+    phi::DenseTensor max_dec_len_tensor;
+    max_dec_len_tensor.Resize({{1}});
+    dev_ctx.template Alloc(&max_dec_len_tensor,
+                           max_dec_len_tensor.numel() * sizeof(int));
+    max_dec_len_this_time_data =
+        GetMaxLen(dev_ctx, seq_lens_decoder, &max_dec_len_tensor, bsz);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        max_dec_len_this_time.get().place().GetType(),
+        phi::AllocationType::CPU,
+        errors::InvalidArgument(
+            "The place of input max_dec_len_this_time must be CPU, but got %s.",
+            max_dec_len_this_time.get().place()));
+    max_dec_len_this_time_data = *max_dec_len_this_time.get().data();
+  }
+  int max_enc_len_this_time_data(0);
+  if (!max_enc_len_this_time) {
+    phi::DenseTensor max_enc_len_tensor;
+    max_enc_len_tensor.Resize({{1}});
+    dev_ctx.template Alloc(&max_enc_len_tensor,
+                           max_enc_len_tensor.numel() * sizeof(int));
+    max_enc_len_this_time_data =
+        GetMaxLen(dev_ctx, seq_lens_encoder, &max_enc_len_tensor, bsz);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        max_enc_len_this_time.get().place().GetType(),
+        phi::AllocationType::CPU,
+        errors::InvalidArgument(
+            "The place of input max_enc_len_this_time must be CPU, but got %s.",
+            max_enc_len_this_time.get().place()));
+    max_enc_len_this_time_data = *max_enc_len_this_time.get().data();
+  }
+
+  const int MAXPTR_N = xpu_context->max_ptr_size();
+  VLOG(3) << "max_len end";
+  phi::DenseTensor unpadding_q, unpadding_k, unpadding_v;
+  phi::DenseTensor softmax_out, softmax_lse, seed_offset;
+  phi::DenseTensor q_trans, k_trans, v_trans, qktv_out;
+  if (!use_pre_cache) {
+    unpadding_q.Resize({{token_num, num_head, dim_head}});
+    unpadding_k.Resize({{token_num, num_head, dim_head}});
+    unpadding_v.Resize({{token_num, num_head, dim_head}});
+
+    dev_ctx.template Alloc(&unpadding_q, unpadding_q.numel() * sizeof(T));
+    dev_ctx.template Alloc(&unpadding_k, unpadding_k.numel() * sizeof(T));
+    dev_ctx.template Alloc(&unpadding_v, unpadding_v.numel() * sizeof(T));
+  } else {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("pre_key_cache is not supported now."));
+  }
+  VLOG(3) << "encoder";
+  VLOG(3) << "max_enc_len_this_time_data: " << max_enc_len_this_time_data;
+  if (qkv_out_scale) {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("qkv_out_scale is not supported now."));
+  } else {
+    VLOG(1) << "qkv_out_scale is none";
+    qkv_buf = qkv;
+  }
+  if (qkv_bias) {
+    PADDLE_THROW(phi::errors::Unimplemented("qkv_bias is not supported now."));
+  }
+  std::vector lods_cpu(bsz + 1, 0);
+  xpu_wait(xpu_context->xpu_stream);
+  xpu_memcpy(lods_cpu.data() + 1,
+             seq_lens_this_time.data(),
+             sizeof(int32_t) * bsz,
+             XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  for (int i = 1; i < bsz + 1; i++) {
+    lods_cpu[i] += lods_cpu[i - 1];
+  }
+  using XPUType = typename XPUTypeTrait::Type;
+  baidu::xpu::api::VectorParam lods =
+      baidu::xpu::api::VectorParam{lods_cpu.data(), bsz + 1, nullptr}
+          .to_xpu(RAII_GUARD);
+  float* p_batch_max_ptrs = RAII_GUARD.alloc_l3_or_gm(bsz);
+
+  if (!rope_emb || !use_neox_style) {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "only use_neox_style rope_emb is supported now."));
+  }
+  if (max_enc_len_this_time_data > 0) {
+    // const int* sequence_lengths_data = seq_lens_encoder.data();
+    qkv_split_rope_kernel(dev_ctx,
+                          qkv,
+                          rope_emb.get(),
+                          seq_lens_encoder,
+                          lods,
+                          bsz,
+                          rope_emb.get().dims()[2],
+                          token_num,
+                          num_head,
+                          dim_head,
+                          &unpadding_q,
+                          &unpadding_k,
+                          &unpadding_v);
+
+    VLOG(3) << "rope end";
+    VLOG(3) << "causal: " << causal;
+    if (!use_pre_cache) {
+      phi::FlashAttnUnpaddedKernel(dev_ctx,
+                                   unpadding_q,
+                                   unpadding_k,
+                                   unpadding_v,
+                                   cu_seqlens_q,
+                                   cu_seqlens_k,
+                                   paddle::none /*fixed_seed_offset*/,
+                                   causal ? paddle::none : mask,
+                                   max_enc_len_this_time_data,
+                                   max_enc_len_this_time_data,
+                                   1.0f / sqrt(static_cast(dim_head)),
+                                   0.0,
+                                   causal,
+                                   false,
+                                   true /* is_test*/,
+                                   "" /*rng_name*/,
+                                   &fmha_buf,
+                                   &softmax_out,
+                                   &softmax_lse,
+                                   &seed_offset);
+    } else {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("use_pre_cache is not supported now."));
+    }
+    VLOG(3) << "flash end";
+    if (cache_k_quant_scales && dynamic_cachekv_quant) {
+      PADDLE_THROW(phi::errors::Unimplemented("quant is not supported now."));
+    } else {
+      std::vector start_token_ctx(bsz, 0);
+      xpu::VectorParam start_token_ctx_VP =
+          xpu::VectorParam{
+              start_token_ctx.data(),
+              static_cast(start_token_ctx.size()),
+              nullptr}
+              .to_xpu(RAII_GUARD);
+
+      std::vector ordered_index_ctx(bsz, 0);
+      std::iota(ordered_index_ctx.begin(), ordered_index_ctx.end(), 0);
+      xpu::VectorParam ordered_index_ctx_VP =
+          xpu::VectorParam{
+              ordered_index_ctx.data(), static_cast(bsz), nullptr}
+              .to_xpu(RAII_GUARD);
+      int ret = xpu::reshape_cached_kv(
+          xpu_context,
+          reinterpret_cast(unpadding_k.data()),
+          reinterpret_cast(const_cast(key_cache.data())),
+          block_tables.data(),
+          lods,
+          start_token_ctx_VP,
+          ordered_index_ctx_VP,
+          bsz,
+          num_head,
+          dim_head,
+          bsz,
+          block_size,
+          max_block_per_seq,
+          "BLHD",
+          "HLD");
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+      ret = xpu::batch_findmax(
+          xpu_context,
+          reinterpret_cast(const_cast(key_cache.data())),
+          token_num,
+          num_head * dim_head,
+          bsz,
+          lods.xpu,
+          p_batch_max_ptrs);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+      ret = xpu::copy2d(
+          xpu_context,
+          p_batch_max_ptrs,
+          const_cast(cache_k_per_batch_maxs.data()),
+          bsz,
+          1,
+          MAXPTR_N,
+          1);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+      ret = xpu::reshape_cached_kv(
+          xpu_context,
+          reinterpret_cast(unpadding_v.data()),
+          reinterpret_cast(const_cast(value_cache.data())),
+          block_tables.data(),
+          lods,
+          start_token_ctx_VP,
+          ordered_index_ctx_VP,
+          bsz,
+          num_head,
+          dim_head,
+          bsz,
+          block_size,
+          max_block_per_seq,
+          "BLHD",
+          "HLD");
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+      ret = xpu::batch_findmax(
+          xpu_context,
+          reinterpret_cast(const_cast(value_cache.data())),
+          token_num,
+          num_head * dim_head,
+          bsz,
+          lods.xpu,
+          p_batch_max_ptrs);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+      ret = xpu::copy2d(
+          xpu_context,
+          p_batch_max_ptrs,
+          const_cast(cache_v_per_batch_maxs.data()),
+          bsz,
+          1,
+          MAXPTR_N,
+          1);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    }
+    VLOG(3) << "cache end";
+  }
+  VLOG(3) << "encoder done";
+  VLOG(3) << "max_dec_len_this_time_data: " << max_dec_len_this_time_data;
+
+  if (max_dec_len_this_time_data > 0) {
+    int cachekv_quant_mode = 0;
+    if (cache_k_quant_scales || cachekv_quant_mode) {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "cache_k_quant_scales or cachekv_quant_mode are not supported now."));
+    }
+
+    qkv_split_rope_kernel(dev_ctx,
+                          qkv,
+                          rope_emb.get(),
+                          seq_lens_encoder,
+                          lods,
+                          bsz,
+                          rope_emb.get().dims()[2],
+                          token_num,
+                          num_head,
+                          dim_head,
+                          &unpadding_q,
+                          &unpadding_k,
+                          &unpadding_v);
+
+    std::vector kv_seq_lod_dec(bsz + 1, 0);
+    std::iota(kv_seq_lod_dec.begin(), kv_seq_lod_dec.end(), 0);
+    xpu::VectorParam kv_seq_lod_dec_VP =
+        xpu::VectorParam{kv_seq_lod_dec.data(),
+                         static_cast(kv_seq_lod_dec.size()),
+                         nullptr}
+            .to_xpu(RAII_GUARD);
+    std::vector start_token_ctx(bsz, 0);
+    for (int i = 0; i < bsz; i++) {
+      start_token_ctx[i] = lods_cpu[i + 1] - lods_cpu[i];
+    }
+    xpu::VectorParam start_token_ctx_VP =
+        xpu::VectorParam{start_token_ctx.data(),
+                         static_cast(start_token_ctx.size()),
+                         nullptr}
+            .to_xpu(RAII_GUARD);
+
+    std::vector ordered_index_ctx(bsz, 0);
+    std::iota(ordered_index_ctx.begin(), ordered_index_ctx.end(), 0);
+    xpu::VectorParam ordered_index_ctx_VP =
+        xpu::VectorParam{
+            ordered_index_ctx.data(), static_cast(bsz), nullptr}
+            .to_xpu(RAII_GUARD);
+
+    float* p_batch_max_ptrs_fill =
+        RAII_GUARD.alloc_l3_or_gm(bsz * MAXPTR_N);
+    int ret = xpu::constant(
+        xpu_context, p_batch_max_ptrs_fill, bsz * MAXPTR_N, 0.0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
+    float* p_cache_k_max_data = RAII_GUARD.alloc_l3_or_gm(MAXPTR_N);
+    float* p_cache_v_max_data = RAII_GUARD.alloc_l3_or_gm(MAXPTR_N);
+    ret = xpu::reshape_cached_kv(
+        xpu_context,
+        reinterpret_cast(unpadding_k.data()),
+        reinterpret_cast(const_cast(key_cache.data())),
+        block_tables.data(),
+        kv_seq_lod_dec_VP,
+        start_token_ctx_VP,
+        ordered_index_ctx_VP,
+        bsz,
+        num_head,
+        dim_head,
+        bsz,
+        block_size,
+        max_block_per_seq,
+        "BLHD",
+        "HLD");
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+    ret = xpu::batch_findmax(
+        xpu_context,
+        reinterpret_cast(unpadding_k.data()),
+        bsz,
+        num_head * dim_head,
+        p_batch_max_ptrs);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+    unpadding_k.clear();
+    ret = xpu::copy2d(xpu_context,
+                      p_batch_max_ptrs,
+                      p_batch_max_ptrs_fill,
+                      bsz,
+                      1,
+                      MAXPTR_N,
+                      1);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    ret = xpu::max(
+        xpu_context,
+        cache_k_per_batch_maxs.data(),
+        p_batch_max_ptrs_fill,
+        const_cast(cache_k_per_batch_maxs.data()),
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "max");
+    ret = xpu::findmax(
+        xpu_context,
+        const_cast(cache_k_per_batch_maxs.data()),
+        p_cache_k_max_data,
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+    ret = xpu::reshape_cached_kv(
+        xpu_context,
+        reinterpret_cast(unpadding_v.data()),
+        reinterpret_cast(const_cast(value_cache.data())),
+        block_tables.data(),
+        kv_seq_lod_dec_VP,
+        start_token_ctx_VP,
+        ordered_index_ctx_VP,
+        bsz,
+        num_head,
+        dim_head,
+        bsz,
+        block_size,
+        max_block_per_seq,
+        "BLHD",
+        "HLD");
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+    ret = xpu::batch_findmax(
+        xpu_context,
+        reinterpret_cast(unpadding_v.data()),
+        bsz,
+        num_head * dim_head,
+        p_batch_max_ptrs);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+    unpadding_v.clear();
+    ret = xpu::copy2d(xpu_context,
+                      p_batch_max_ptrs,
+                      p_batch_max_ptrs_fill,
+                      bsz,
+                      1,
+                      MAXPTR_N,
+                      1);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    ret = xpu::max(
+        xpu_context,
+        cache_v_per_batch_maxs.data(),
+        p_batch_max_ptrs_fill,
+        const_cast(cache_v_per_batch_maxs.data()),
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "max");
+    ret = xpu::findmax(
+        xpu_context,
+        const_cast(cache_v_per_batch_maxs.data()),
+        p_cache_v_max_data,
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+
+    VLOG(1) << "cachekv_quant_mode " << cachekv_quant_mode;
+    std::vector lods_decoder_cpu(bsz + 1, 0);
+    xpu_wait(xpu_context->xpu_stream);
+    xpu_memcpy(lods_decoder_cpu.data() + 1,
+               seq_lens_decoder.data(),
+               sizeof(int32_t) * bsz,
+               XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    for (int i = 1; i < bsz + 1; i++) {
+      lods_decoder_cpu[i] += lods_decoder_cpu[i - 1];
+    }
+    std::vector qkvlod_dec(2 * (bsz + 1), 0);
+    for (int bs = 0; bs < bsz; bs++) {
+      qkvlod_dec[bs + 1] = bs + 1;
+      qkvlod_dec[bsz + 1 + bs + 1] = lods_decoder_cpu[bs + 1] + 1;
+    }
+    auto qkvlod_dec_vp =
+        xpu::VectorParam{
+            qkvlod_dec.data(), static_cast(qkvlod_dec.size()), nullptr}
+            .to_xpu(RAII_GUARD);
+    xpu::DecodeAttnParam decoder_attn_vsl_param(
+        qkvlod_dec_vp, max_seq_len, num_head, dim_head, -1, 0, bsz, {});
+    xpu::PageAttnParam page_param(
+        block_size, bsz, max_block_per_seq, ordered_index_ctx_VP, 0, "HLD");
+    float* max_q_ptr = RAII_GUARD.alloc_l3_or_gm(MAXPTR_N);
+    ret = xpu::findmax(xpu_context,
+                       reinterpret_cast(unpadding_q.data()),
+                       max_q_ptr,
+                       token_num * num_head * dim_head);
+
+    ret = xpu::qkv_paged_attention(
+        xpu_context,
+        reinterpret_cast(unpadding_q.data()),
+        reinterpret_cast(const_cast(key_cache.data())),
+        reinterpret_cast(const_cast(value_cache.data())),
+        block_tables.data(),  // [pagep.max_batch_size,
+                              //  pagep.max_num_blocks_per_seq]
+        reinterpret_cast(fmha_buf.data()),
+        max_q_ptr,
+        p_cache_k_max_data,  // shape=[6], nullptr if pagep.quant_type == 1
+        p_cache_v_max_data,  // shape=[6], nullptr if pagep.quant_type == 1
+        nullptr,
+        decoder_attn_vsl_param,  // attention-related parameters
+        page_param);             // page-attention-related parameters
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "qkv_paged_attention");
+  }
+  VLOG(3) << "decoder done";
+}
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(block_multihead_attention_xpu,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::BlockMultiheadAttentionXPUKernel,
+                   phi::dtype::float16) {
+  kernel->InputAt(26).SetBackend(phi::Backend::CPU);
+  kernel->InputAt(27).SetBackend(phi::Backend::CPU);
+}
diff --git a/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
index 833caa6688787..cac0182feaa2b 100644
--- a/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
@@ -63,6 +63,11 @@ void FusedLayerNormKernel(const Context& dev_ctx,
   dev_ctx.template Alloc(&residual_alpha_tmp);
   dev_ctx.template Alloc(&residual_alpha_ptr);

+  r = baidu::xpu::api::constant(xpu_ctx->x_context(),
+
reinterpret_cast(out->data()), + out->numel(), + static_cast(0.f)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); r = baidu::xpu::api::constant(xpu_ctx->x_context(), residual_alpha_tmp.data(), diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index 4835b643efcc7..720447ea41a0e 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -123,6 +123,17 @@ inline __device__ void UpdateSum(const T* srcX, float* sum, float* sumSq) { *sumSq += src_data * src_data; } +template +inline __device__ void UpdateSum(const T* srcX, + const T* srcR, + float* sum, + float* sumSq) { + float src_data = phi::__2float(*srcX); + float srcy_data = phi::__2float(*srcR); + *sum += src_data + srcy_data; + *sumSq += (src_data + srcy_data) * (src_data + srcy_data); +} + template <> inline __device__ void UpdateSum<__half, 2>(const __half* srcX, float* sum, @@ -133,6 +144,20 @@ inline __device__ void UpdateSum<__half, 2>(const __half* srcX, *sumSq += f2.x * f2.x + f2.y * f2.y; } +template <> +inline __device__ void UpdateSum<__half, 2>(const __half* srcX, + const __half* srcR, + float* sum, + float* sumSq) { + __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); + __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR); + float2 f2 = __half22float2(h2); + float2 f2_r = __half22float2(h2_r); + *sum += f2.x + f2_r.x + f2.y + f2_r.y; + *sumSq += + (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); +} + template <> inline __device__ void UpdateSum( const phi::dtype::float16* srcX, float* sum, float* sumSq) { @@ -142,6 +167,21 @@ inline __device__ void UpdateSum( *sumSq += f2.x * f2.x + f2.y * f2.y; } +template <> +inline __device__ void UpdateSum( + const phi::dtype::float16* srcX, + const phi::dtype::float16* srcR, + float* sum, + float* sumSq) { + __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); + __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR); + float2 f2 = __half22float2(h2); + float2 f2_r = __half22float2(h2_r); + *sum += f2.x + f2_r.x + f2.y + f2_r.y; + *sumSq += + (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); +} + #ifdef PADDLE_CUDA_BF16 template <> inline __device__ void UpdateSum( @@ -151,6 +191,21 @@ inline __device__ void UpdateSum( *sum += f2.x + f2.y; *sumSq += f2.x * f2.x + f2.y * f2.y; } + +template <> +inline __device__ void UpdateSum( + const phi::dtype::bfloat16* srcX, + const phi::dtype::bfloat16* srcR, + float* sum, + float* sumSq) { + __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX); + __nv_bfloat162 h2_r = *reinterpret_cast<__nv_bfloat162 const*>(srcR); + float2 f2 = phi::bfloat1622float2(h2); + float2 f2_r = phi::bfloat1622float2(h2_r); + *sum += f2.x + f2_r.x + f2.y + f2_r.y; + *sumSq += + (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); +} #endif template @@ -177,7 +232,13 @@ __global__ void groupNormNDHWCSumSingerChannelKernel( int64_t offset = static_cast(ni) * params.dhwc + static_cast(dhwi) * params.c + ci; float src_data = *reinterpret_cast(¶ms.srcX[offset]); - UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + if (params.srcR != nullptr) { + int64_t g_offset = params.y_same_with_x ? 
offset : ci; + UpdateSum( + ¶ms.srcX[offset], ¶ms.srcR[g_offset], &sum, &sumSq); + } else { + UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + } } smem[threadIdx.x] = make_float2(sum, sumSq); @@ -185,7 +246,6 @@ __global__ void groupNormNDHWCSumSingerChannelKernel( __syncthreads(); float2 sums = smem[threadIdx.x]; - atomicAdd(¶ms.redBuffer[(2 * ni + 0) * params.groups + ci], sums.x * params.invDHWC); atomicAdd(¶ms.redBuffer[(2 * ni + 1) * params.groups + ci], sums.y); @@ -209,7 +269,8 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams params) { if (ci >= params.c || threadIdx.x * THREADS_PER_CHANNEL >= params.cPerBlock) { return; } - // The first activation loaded by that block. + int32_t gj = ci / params.cPerGroup; + int32_t cj = ci % params.cPerGroup; int32_t dhwBegin = blockIdx.y * params.dhwPerBlock; // The last activation loaded by that block. int32_t dhwEnd = min(dhwBegin + params.dhwPerBlock, params.dhw); @@ -223,13 +284,19 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams params) { int64_t offset = static_cast(ni) * params.dhwc + static_cast(dhwi) * params.c + ci; float src_data = *reinterpret_cast(¶ms.srcX[offset]); - UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + if (params.srcR != nullptr) { + int64_t g_offset = + params.y_same_with_x ? offset : gj * params.cPerGroup + cj; + UpdateSum( + ¶ms.srcX[offset], ¶ms.srcR[g_offset], &sum, &sumSq); + } else { + UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + } } // The group that thread works on and the channel in the group (modulus). int32_t gi = ci / params.cPerGroup - blockIdx.x * params.cPerBlock / params.cPerGroup; - int32_t cj = ci % params.cPerGroup; int flag = (cj == 0 || threadIdx.x == 0) ? 1 : 0; GroupSums inp{flag, sum, sumSq}; GroupSums out; @@ -243,7 +310,6 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams params) { __syncthreads(); - int32_t gj = ci / params.cPerGroup; if (cj == params.cPerGroup - THREADS_PER_CHANNEL || threadIdx.x * THREADS_PER_CHANNEL == params.cPerBlock - THREADS_PER_CHANNEL) { @@ -351,7 +417,15 @@ inline __device__ void GroupNormCompute(int32_t dhwBegin, for (int32_t dhwi = dhwBegin; dhwi < dhwEnd; ++dhwi) { // The src/dst offset. int64_t offset = (int64_t)blockIdx.z * params.dhwc + dhwi * params.c + ci; - const float src_data = phi::__2float(params.srcX[offset]); + float src_data = phi::__2float(params.srcX[offset]); + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + src_data += phi::__2float(params.srcR[g_offset]); + *reinterpret_cast(¶ms.eleOut[offset]) = phi::__2dst(src_data); + } // Normalize the channels. float dst_data = (src_data - mean) * invStdDev; // Scale by gamma and add beta. @@ -392,6 +466,18 @@ inline __device__ void GroupNormCompute( // Extract the two half values. float2 f2 = __half22float2(h2); + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + __half2 r2 = *reinterpret_cast<__half2 const*>(¶ms.srcR[g_offset]); + float2 r_f2 = __half22float2(r2); + f2.x += r_f2.x; + f2.y += r_f2.y; + *reinterpret_cast<__half2*>(¶ms.eleOut[offset]) = + __float22half2_rn(f2); + } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; f2.y = (f2.y - mean) * invStdDev; @@ -434,7 +520,18 @@ inline __device__ void GroupNormCompute<__half, 2>( // Extract the two half values. 
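// Residual fusion in the vectorized __half2 path below: when srcR is set,
// the matching residual pair is loaded as __half2, added into f2 before
// normalization, and the pre-normalization sum is stored to eleOut.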
float2 f2 = __half22float2(h2); - + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + __half2 r2 = *reinterpret_cast<__half2 const*>(&params.srcR[g_offset]); + float2 r_f2 = __half22float2(r2); + f2.x += r_f2.x; + f2.y += r_f2.y; + *reinterpret_cast<__half2*>(&params.eleOut[offset]) = + __float22half2_rn(f2); + } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; f2.y = (f2.y - mean) * invStdDev; @@ -480,6 +577,19 @@ inline __device__ void GroupNormCompute( // Extract the two half values. float2 f2 = phi::bfloat1622float2(h2); + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + __nv_bfloat162 r2 = + *reinterpret_cast<__nv_bfloat162 const*>(&params.srcR[g_offset]); + float2 r_f2 = phi::bfloat1622float2(r2); + f2.x += r_f2.x; + f2.y += r_f2.y; + *reinterpret_cast<__nv_bfloat162*>(&params.eleOut[offset]) = + phi::float22bfloat162_rn(f2); + } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; f2.y = (f2.y - mean) * invStdDev; @@ -511,6 +621,7 @@ __global__ void groupNormNDHWCScaleKernel( // The group that thread works on and the channel in the group (modulus). int32_t gi = ci / params.cPerGroup; + int32_t gj = ci % params.cPerGroup; if (ci >= params.c || gi >= params.groups) { return; } @@ -597,17 +708,24 @@ template class groupNormNDHWCScale; template void GroupNormNDHWCKernel(const Context& dev_ctx, const DenseTensor& x, + const paddle::optional& residual, const paddle::optional& scale, const paddle::optional& bias, float epsilon, int groups, const std::string& data_layout_str, + const std::string& activation, DenseTensor* y, + DenseTensor* residual_out, DenseTensor* mean, DenseTensor* var) { + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); + if (data_layout != DataLayout::kNHWC) { + PD_THROW("data_layout only supports NHWC and NDHWC"); + } using AccT = typename phi::dtype::MPTypeTrait::Type; GroupNormNDHWCParams params_; - params_.withSilu = false; + params_.withSilu = activation == "silu" ? true : false; const auto x_dims = x.dims(); dev_ctx.template Alloc(y); @@ -639,6 +757,23 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, params_.w = x_dims[3]; } + const T* residual_data = nullptr; + const auto residual_ptr = residual.get_ptr(); + T* residual_out_data = nullptr; + if (residual_ptr) { + dev_ctx.template Alloc(residual_out); + residual_data = residual_ptr->data(); + residual_out_data = residual_out->data(); + const auto r_dims = residual_ptr->dims(); + int32_t r_dim = 1; + for (size_t i = 0; i < r_dims.size(); i++) { + r_dim *= r_dims[i]; + } + params_.y_same_with_x = + r_dim == params_.n * params_.c * params_.d * params_.h * params_.w + ?
true + : false; + } dev_ctx.template Alloc(mean); dev_ctx.template Alloc(var); auto* mean_data = mean->data(); @@ -673,7 +808,10 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, } params_.srcX = reinterpret_cast(x_data); params_.dst = reinterpret_cast(y_data); - + if (residual_ptr) { + params_.srcR = reinterpret_cast(residual_data); + params_.eleOut = reinterpret_cast(residual_out_data); + } params_.gamma = scale_data; params_.beta = bias_data; params_.dhw = params_.d * params_.h * params_.w; @@ -1027,14 +1165,19 @@ void GroupNormKernel(const Context& dev_ctx, DenseTensor* var) { using std::is_same; if (is_same::value && data_layout_str == "NHWC") { + const paddle::optional& residual = + paddle::optional(paddle::none); GroupNormNDHWCKernel(dev_ctx, x, + residual, scale, bias, epsilon, groups, data_layout_str, + "", y, + new DenseTensor(), mean, var); return; @@ -1042,14 +1185,19 @@ void GroupNormKernel(const Context& dev_ctx, #ifdef PADDLE_CUDA_BF16 if (is_same::value && data_layout_str == "NHWC") { + const paddle::optional& residual = + paddle::optional(paddle::none); GroupNormNDHWCKernel(dev_ctx, x, + residual, scale, bias, epsilon, groups, data_layout_str, + "", y, + new DenseTensor(), mean, var); return; @@ -1076,3 +1224,13 @@ PD_REGISTER_KERNEL(group_norm, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } + +PD_REGISTER_KERNEL(add_group_norm_silu, + GPU, + ALL_LAYOUT, + phi::GroupNormNDHWCKernel, + phi::dtype::bfloat16, + phi::dtype::float16) { + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/gpu/inverse_grad_kernel.cu b/paddle/phi/kernels/gpu/inverse_grad_kernel.cu index 2fdc02934fedc..15c24719adfc3 100644 --- a/paddle/phi/kernels/gpu/inverse_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/inverse_grad_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/inverse_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - inverse_grad, GPU, ALL_LAYOUT, phi::InverseGradKernel, float, double) {} +PD_REGISTER_KERNEL(inverse_grad, + GPU, + ALL_LAYOUT, + phi::InverseGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/inverse_kernel.cu b/paddle/phi/kernels/gpu/inverse_kernel.cu index 4c011337c6f8f..a9b4fcc763b0b 100644 --- a/paddle/phi/kernels/gpu/inverse_kernel.cu +++ b/paddle/phi/kernels/gpu/inverse_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/inverse_kernel_impl.h" -PD_REGISTER_KERNEL( - inverse, GPU, ALL_LAYOUT, phi::InverseKernel, float, double) {} +PD_REGISTER_KERNEL(inverse, + GPU, + ALL_LAYOUT, + phi::InverseKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/lstm_grad_kernel.cu new file mode 100644 index 0000000000000..5590541dcb385 --- /dev/null +++ b/paddle/phi/kernels/gpu/lstm_grad_kernel.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" +#include "paddle/phi/kernels/lstm_kernel.h" + +PD_REGISTER_KERNEL( + lstm_grad, GPU, ALL_LAYOUT, phi::LSTMGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lstm_kernel.cu b/paddle/phi/kernels/gpu/lstm_kernel.cu new file mode 100644 index 0000000000000..7bcf1f78ab604 --- /dev/null +++ b/paddle/phi/kernels/gpu/lstm_kernel.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lstm_kernel.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" + +PD_REGISTER_KERNEL(lstm, GPU, ALL_LAYOUT, phi::LSTMKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc index 2dd9e7dc6ceec..3244f28c77700 100644 --- a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(meshgrid_grad, double, int, int64_t, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc index 5a1c74f4193d3..9176305d94fec 100644 --- a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(meshgrid, double, int, int64_t, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/group_norm_kernel.h b/paddle/phi/kernels/group_norm_kernel.h index 3dc10df6a1109..7f4b83f065bde 100644 --- a/paddle/phi/kernels/group_norm_kernel.h +++ b/paddle/phi/kernels/group_norm_kernel.h @@ -67,6 +67,8 @@ struct GroupNormNDHWCParams { T const* srcX; // The input buffer. Layout NDHWC. T const* srcY; + // The residual input buffer to add to srcX. Layout NDHWC. + T const* srcR = nullptr; // The gamma scaling factor. void const* gamma; // The beta term to add in GN. @@ -87,7 +89,8 @@ struct GroupNormNDHWCParams { int32_t groups; // Do we apply the Silu activation function? bool withSilu; - + // Whether srcR has the same full NDHWC shape as srcX (otherwise it is + // broadcast per channel). + bool y_same_with_x = false; // Precomputed values and parameters to control the execution of the kernels.
// The number of activations per instance (d * h * w) and the number of diff --git a/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h b/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h index 26e2898bf73ff..aa23bddb5b979 100644 --- a/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h @@ -18,6 +18,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" @@ -37,15 +38,35 @@ void InverseGradKernel(const Context& dev_ctx, tmp_out.Resize(out.dims()); dev_ctx.template Alloc(&tmp_out); - auto mat_dim_a0 = - phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); - auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0)); + if (IsComplexType(out.dtype())) { + DenseTensor out_conj; + out_conj.Resize(out.dims()); + dev_ctx.template Alloc(&out_conj); - auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); - blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); + phi::ConjKernel(dev_ctx, out, &out_conj); + + auto mat_dim_a0 = + phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); + auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul( + out_grad, mat_dim_a0, out_conj, mat_dim_b0, T(1), &tmp_out, T(0)); + + auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + auto mat_dim_b1 = + phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); + blas.MatMul( + out_conj, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); + } else { + auto mat_dim_a0 = + phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); + auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0)); + + auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + auto mat_dim_b1 = + phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); + blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); + } } } diff --git a/paddle/phi/kernels/impl/lstm_kernel_impl.h b/paddle/phi/kernels/impl/lstm_kernel_impl.h new file mode 100644 index 0000000000000..1f4b4dcac0f14 --- /dev/null +++ b/paddle/phi/kernels/impl/lstm_kernel_impl.h @@ -0,0 +1,443 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
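For reference, the conjugated branch added to inverse_grad_kernel_impl.h above is the standard matrix-inverse gradient. With Y = X^{-1} and G = ∂L/∂Y, the differential d(X^{-1}) = -X^{-1} dX X^{-1} yields, in Wirtinger calculus (covering the complex dtypes),

\[ \frac{\partial L}{\partial X} = -\,Y^{H}\, G\, Y^{H}, \qquad Y^{H} = \overline{Y}^{\mathsf{T}}, \]

which is exactly what the two MatMul calls compute: tmp_out = G * conj(Y)^T followed by in_grad = -conj(Y)^T * tmp_out. For real dtypes conj(Y) = Y, recovering the original Y^T-only path.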
+ +#pragma once +#include + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/lstm_utils.h" + +namespace phi { + +template +void LSTMKernel(const Context& dev_ctx, + const DenseTensor& input, + const paddle::optional& h0, + const paddle::optional& c0, + const DenseTensor& weight, + const DenseTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* hidden, + DenseTensor* cell, + DenseTensor* batch_gate, + DenseTensor* batch_cell_pre_act) { + auto* hidden_t0 = h0.get_ptr(); + auto* cell_t0 = c0.get_ptr(); + + phi::DenseTensor* batch_gate_new = nullptr; + phi::DenseTensor batch_gate_temp; + if (is_test) { + batch_gate_new = &batch_gate_temp; + batch_gate_new->Resize(input.dims()); + } else { + batch_gate_new = batch_gate; + } + + dev_ctx.template Alloc(batch_gate_new); + dev_ctx.template Alloc(hidden); + dev_ctx.template Alloc(cell); + + phi::funcs::LoDTensor2BatchFunctor to_batch; + to_batch(dev_ctx, input, batch_gate_new, true, is_reverse); + + auto in_dims = input.dims(); + int frame_size = static_cast(in_dims[1] / 4); + phi::DDim dims({in_dims[0], frame_size}); + + if (bias.initialized()) { + phi::DenseTensor b = bias; + b.Resize({bias.numel(), 1}); + phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); + phi::funcs::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate_new, gate_bias, batch_gate_new); + } + + phi::funcs::LstmMetaValue lstm_value; + if (bias.initialized() && use_peepholes) { + T* bias_data = const_cast(bias.data()); + // The code style in LstmMetaValue will be updated later. + + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + lstm_value.prev_state_value = nullptr; + phi::DenseTensor ordered_c0; + + phi::Vector order(batch_gate_new->lod()[2]); + + if (cell_t0) { + // Since batch computing for LSTM reorders the input sequences by + // length, the initial cell state also needs to be reordered. + ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use local variables as the batched buffers here.
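+  // Sketch of what the batched loop below computes for each time-major
+  // batch slice (the op's input already carries the X_t * W_x projections,
+  // which is why in_dims[1] == 4 * frame_size):
+  //   gates_t = batch_gate_t + H_{t-1} * W_h            (the blas.MatMul call)
+  //   C_t     = f_t .* C_{t-1} + i_t .* cand_act(gates_t)   (LstmUnitFunctor)
+  //   H_t     = o_t .* cell_act(C_t)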
+ phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp; + phi::DenseTensor* batch_cell_pre_act_p; + if (is_test) { + batch_cell_pre_act_p = &batch_cell_pre_act_temp; + } else { + batch_cell_pre_act_p = batch_cell_pre_act; + } + batch_hidden.Resize(dims); + batch_cell.Resize(dims); + dev_ctx.template Alloc(&batch_hidden); + dev_ctx.template Alloc(&batch_cell); + batch_cell_pre_act_p->Resize(dims); + dev_ctx.template Alloc(batch_cell_pre_act_p); + + auto batch_starts = batch_gate_new->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = phi::funcs::detail::GetActivationType(gate_activation); + auto cell_act = phi::funcs::detail::GetActivationType(cell_activation); + auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation); + + auto blas = phi::funcs::GetBlas(dev_ctx); + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + phi::DenseTensor gate_t = batch_gate_new->Slice(bstart, bend); + phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); + phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act_t = batch_cell_pre_act_p->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + blas.MatMul(pre_hidden_t, + false, + weight, + false, + static_cast(1.0), + &gate_t, + static_cast(1.0)); + } else if (hidden_t0 != nullptr) { + // If n == 0 and there is no initial hidden state, i.e. H0 is all + // zeros, the calculation W_h * H0 is skipped. + // If n == 0 and an initial hidden state is given, calculate W_h * H0. + + // Since batch computing for LSTM reorders the input sequences by + // length, the initial hidden state also needs to be reordered.
+ phi::DenseTensor ordered_h0; + ReorderInitState( + dev_ctx, *hidden_t0, order, &ordered_h0, true); + blas.MatMul(ordered_h0, + false, + weight, + false, + static_cast(1.0), + &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + T cell_clip = 0.0; + phi::funcs::LstmUnitFunctor::compute(dev_ctx, + lstm_value, + frame_size, + cur_batch_size, + cell_clip, + gate_act, + cell_act, + cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + phi::funcs::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batch_gate_new->lod()); + // restore the output hidden in phi::DenseTensor from the batch hidden + to_seq(dev_ctx, batch_hidden, hidden); + + batch_cell.set_lod(batch_gate_new->lod()); + // restore the output cell state in phi::DenseTensor from the batch cell + to_seq(dev_ctx, batch_cell, cell); +} + +template +void LSTMGradKernel(const Context& dev_ctx, + const DenseTensor& input_in, + const paddle::optional& h0_in, + const paddle::optional& c0_in, + const DenseTensor& weight_in, + const DenseTensor& bias_in, + const DenseTensor& hidden_in, + const DenseTensor& cell_in, + const DenseTensor& batch_gate_in, + const DenseTensor& batch_cell_pre_act_in, + const DenseTensor& hidden_grad, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* input_grad, + DenseTensor* h0_grad, + DenseTensor* c0_grad, + DenseTensor* weight_grad, + DenseTensor* bias_grad) { + auto* input = &input_in; + auto* weight = &weight_in; + auto* bias = &bias_in; + + auto* hidden_out = &hidden_in; + auto* cell_out = &cell_in; + + auto* batch_gate = &batch_gate_in; + auto* batch_cell_pre_act = &batch_cell_pre_act_in; + + auto* hidden_g = &hidden_grad; + + auto* in_g = input_grad; + auto* weight_g = weight_grad; + auto* bias_g = bias_grad; + + auto* h0 = h0_in.get_ptr(); + auto* c0 = c0_in.get_ptr(); + + auto* h0_g = h0_grad; + auto* c0_g = c0_grad; + + phi::funcs::SetConstant zero; + if (weight_g) { + dev_ctx.template Alloc(weight_g); + zero(dev_ctx, weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. 
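+  // Note: the reordering is undone at the end of this kernel, where
+  // ReorderInitState is called again with its final flag set to false so
+  // that ordered_h0_g / ordered_c0_g are written back to h0_grad / c0_grad
+  // in the caller's original sequence order.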
+ phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + phi::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(dev_ctx, *c0, order, &ordered_c0, true); + } + if (c0 && c0_g) { + ordered_c0_g.Resize(c0_g->dims()); + dev_ctx.template Alloc(&ordered_c0_g); + } + + auto in_dims = input->dims(); + auto out_dims = hidden_g->dims(); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, + out_dims[1], + phi::errors::InvalidArgument( + "The second dimension of Input(hidden_grad) should be " + "%d, but received %d in LSTM@Grad operator.", + frame_size, + out_dims[1])); + + phi::funcs::LstmMetaValue lstm_value; + if (bias && use_peepholes) { + T* bias_data = const_cast(bias->data()); + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + + phi::funcs::LstmMetaGrad lstm_grad; + + if (bias && bias_g) { + dev_ctx.template Alloc(bias_g); + zero(dev_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && use_peepholes) { + T* bias_g_data = bias_g->data(); + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; + } else { + lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; + } + + phi::funcs::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch](const Context& ctx, + const phi::DenseTensor& src, + const phi::DDim& dims, + phi::DenseTensor& dst) { + dst.Resize(dims); + ctx.template Alloc(&dst); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, &dst, false); + }; + + phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell; + ToBatch(dev_ctx, *hidden_out, out_dims, batch_hidden); + ToBatch(dev_ctx, *hidden_g, out_dims, batch_hidden_g); + ToBatch(dev_ctx, *cell_out, out_dims, batch_cell); + + phi::DenseTensor batch_cell_g, batch_gate_g; + batch_cell_g.Resize(out_dims); + dev_ctx.template Alloc(&batch_cell_g); + // TODO(qingqing) support the case output cell has gradient. 
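+  // Until that TODO is resolved, the gradient flowing into the final cell
+  // state is treated as zero, which is why batch_cell_g is zero-filled
+  // below instead of being batched in from a cell output gradient.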
+ // to_batch(dev_ctx, *cell_g, batch_cell_g, false); + zero(dev_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.Resize(batch_gate->dims()); + dev_ctx.template Alloc(&batch_gate_g); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = phi::funcs::detail::GetActivationType(gate_activation); + auto cell_act = phi::funcs::detail::GetActivationType(cell_activation); + auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto blas = phi::funcs::GetBlas(dev_ctx); + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + phi::DenseTensor gate = batch_gate->Slice(bstart, bend); + phi::DenseTensor cell = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); + + phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); + phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); + phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; + } + + // lstm_value.output_value not used in bp, set to nullptr + // lstm_grad.state_active_grad not used in bp, set to nullptr + lstm_value.output_value = nullptr; + lstm_grad.state_active_grad = nullptr; + int cur_batch_size = bend - bstart; + T cell_clip = 0.0; + phi::funcs::LstmUnitGradFunctor::compute(dev_ctx, + lstm_value, + lstm_grad, + frame_size, + cur_batch_size, + cell_clip, + gate_act, + cell_act, + cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); + blas.MatMul(gate_g, + false, + *weight, + true, + static_cast(1.0), + &pre_hidden_g, + static_cast(1.0)); + if (weight_g) { + /* backward weight */ + auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); + blas.MatMul(pre_hidden, + true, + gate_g, + false, + static_cast(1.0), + weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, true); + blas.MatMul(ordered_h0, + true, + gate_g, + false, + static_cast(1.0), + weight_g, + static_cast(1.0)); + } + if (h0 && h0_g) { + ordered_h0_g.Resize(h0_g->dims()); + dev_ctx.template Alloc(&ordered_h0_g); + blas.MatMul(gate_g, + false, + *weight, + true, + static_cast(1.0), + &ordered_h0_g, + static_cast(0.0)); + } + } + } + + phi::funcs::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + dev_ctx.template Alloc(in_g); + to_seq(dev_ctx, batch_gate_g, in_g); + } + if (bias && bias_g) { + /* backward bias */ + phi::DenseTensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + phi::funcs::ColwiseSum col_sum; + col_sum(dev_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(dev_ctx, ordered_h0_g, order, h0_g, false); + } + if (c0 && c0_g) { + ReorderInitState(dev_ctx, ordered_c0_g, order, c0_g, false); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/lstm_kernel.h b/paddle/phi/kernels/lstm_kernel.h new file mode 100644 index 0000000000000..42195e375c3a9 --- /dev/null +++ b/paddle/phi/kernels/lstm_kernel.h @@ -0,0 +1,66 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
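For reference, the BLAS calls in LSTMGradKernel above implement the per-slice BPTT identities (a sketch; G_t denotes the gate pre-activation gradient gate_g produced by LstmUnitGradFunctor):

\[ \frac{\partial L}{\partial H_{t-1}} \mathrel{+}= G_t W_h^{\mathsf{T}}, \qquad \frac{\partial L}{\partial W_h} \mathrel{+}= H_{t-1}^{\mathsf{T}} G_t, \qquad \frac{\partial L}{\partial b} = \sum_t \operatorname{colsum}(G_t), \]

with the first slice (n == 0) substituting the reordered h0 for H_{t-1}. The trailing ColwiseSum over batch_gate_g realizes the bias term, and Batch2LoDTensorFunctor scatters batch_gate_g back to sequence order as input_grad.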
+ +#pragma once +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void LSTMKernel(const Context& dev_ctx, + const DenseTensor& input, + const paddle::optional& h0, + const paddle::optional& c0, + const DenseTensor& weight, + const DenseTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* hidden, + DenseTensor* cell, + DenseTensor* batch_gate, + DenseTensor* batch_cell_pre_act); + +template +void LSTMGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const paddle::optional& h0, + const paddle::optional& c0, + const DenseTensor& weight, + const DenseTensor& bias, + const DenseTensor& hidden, + const DenseTensor& cell, + const DenseTensor& batch_gate, + const DenseTensor& batch_cell_pre_act, + const DenseTensor& hidden_grad, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* input_grad, + DenseTensor* h0_grad, + DenseTensor* c0_grad, + DenseTensor* weight_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel_impl.cc b/paddle/phi/kernels/reduce_kernel_impl.cc index 000cb99034c26..9319248099903 100644 --- a/paddle/phi/kernels/reduce_kernel_impl.cc +++ b/paddle/phi/kernels/reduce_kernel_impl.cc @@ -20,10 +20,16 @@ namespace phi { // oneDNN's reduction kernel is optimized only for reducing throughout the // most outer dims, so in case of another type of reduction, it would be // better to fallback to native implementation -inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx) { +inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx, + const bool mean_op) { const DenseTensor& x = ctx->InputAt(0); - const TensorRef& dims_tmp = ctx->AttrAt(0); - IntArray dims_array = IntArray(*dims_tmp.Get()); + IntArray dims_array; + if (mean_op) { + dims_array = ctx->AttrAt(0); + } else { + const TensorRef& dims_tmp = ctx->AttrAt(0); + dims_array = IntArray(*dims_tmp.Get()); + } int ndims = x.dims().size(); const bool reduce_all = recompute_reduce_all(x, dims_array); auto dims = dims_array.GetData(); @@ -53,7 +59,15 @@ inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx) { bool ReduceCheckIfOneDNNSupport(const KernelContext* ctx) { if (ctx->InputAt(0).dims().size() > 5 || - !HasOptimizedOneDNNKernel(ctx)) { + !HasOptimizedOneDNNKernel(ctx, false)) { + return false; + } + return true; +} + +bool ReduceMeanCheckIfOneDNNSupport(const KernelContext* ctx) { + if (ctx->InputAt(0).dims().size() > 5 || + !HasOptimizedOneDNNKernel(ctx, true)) { return false; } return true; diff --git a/paddle/phi/kernels/reduce_kernel_impl.h b/paddle/phi/kernels/reduce_kernel_impl.h index aef4f57ddbdcf..e117f6ab335dd 100644 --- a/paddle/phi/kernels/reduce_kernel_impl.h +++ b/paddle/phi/kernels/reduce_kernel_impl.h @@ -21,4 +21,6 @@ bool ReduceCheckIfOneDNNSupport(const KernelContext* ctx); bool ReduceGradCheckIfOneDNNSupport(const KernelContext* ctx); +bool ReduceMeanCheckIfOneDNNSupport(const KernelContext* ctx); + } // namespace phi diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 16b3abf0e2931..a657e7ba8c01d 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ 
b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -67,7 +67,7 @@ PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {} #if defined(PADDLE_WITH_DNNL) PD_REGISTER_KERNEL( mean, OneDNN, ONEDNN, phi::MeanKernel, float, phi::dtype::bfloat16) { - kernel->check_if_onednn_kernel_support_ = phi::ReduceCheckIfOneDNNSupport; + kernel->check_if_onednn_kernel_support_ = phi::ReduceMeanCheckIfOneDNNSupport; } #endif diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh b/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh index 775c23def14b0..3b6de498ef5b5 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh @@ -566,7 +566,7 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f16f16f3 // conv_forward_cuda_m128n16k16_f32f32f32 template -__global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) +__global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) { const int K_tile = 16; @@ -578,27 +578,27 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 __shared__ float B_shared[256]; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { C_local[i] = 0.0; } - + int K_loops = K_implicit / 16; - int block_num_n = (N - 1) / 16 + 1; + int block_num_n = (N - 1) / 16 + 1; int blockIdx_m = (int)blockIdx.x / block_num_n; int blockIdx_n = (int)blockIdx.x % block_num_n; int threadIdx_x = (int)threadIdx.x; // hoisting shared pointer offsets - int * out_in_map_ptr = out_in_map - + (blockIdx_m * 128 + (threadIdx_x / (16/4)))* kernel_volume; + int * out_in_map_ptr = out_in_map + + (blockIdx_m * 128 + (threadIdx_x / (16/4)))* kernel_volume; - float * B_ptr = B - + (threadIdx_x / (16/4)) * N - + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); + float * B_ptr = B + + (threadIdx_x / (16/4)) * N + + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); float * A_shared_ptr = A_shared + (threadIdx_x * 4); - float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 16); + float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 16); float * B_shared_ptr = B_shared + (threadIdx_x * 4); float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 4); @@ -648,7 +648,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 } int* out_in_map_ptr_local = out_in_map_ptr + k_0 * 16 / K_tile_padded; - float* A_ptr_local = A + (k_0 * 16 % K_tile_padded) + channel_offset_A; + float* A_ptr_local = A + (k_0 * 16 % K_tile_padded) + channel_offset_A; float* B_ptr_local; if constexpr (K_ld_check) @@ -661,14 +661,14 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) { - int input_idx = *(out_in_map_ptr_local + (ax0_ax1_fused_0 *16) * kernel_volume); + int input_idx = *(out_in_map_ptr_local + (ax0_ax1_fused_0 *16) * kernel_volume); if (input_idx != -1) { uint4 A_loaded = make_uint4(0, 0, 0, 0); global_load(A_loaded, A_ptr_local + (input_idx * K_original) , A_pred_guard); *(uint4 *)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = A_loaded; } - else + else { *(uint4*)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = make_uint4(0, 0, 0, 0); } 
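The conv_forward_cuda_* hunks in this file are whitespace-only cleanups, but the kernels' structure is easy to lose in the churn: each is an implicit GEMM in which out_in_map supplies the gather indices. A plain host-side reference of the same contraction (a sketch for orientation, not code from this PR; it assumes A is [num_inputs, K_original] row-major, B is [kernel_volume * K_original, N] row-major, and out_in_map is [M, kernel_volume] with -1 marking kernel taps that hit no input):

// Naive reference for the implicit-GEMM sparse convolution:
//   C[m][n] = sum_k sum_c A[out_in_map[m][k]][c] * B[k * K_original + c][n]
void conv_forward_reference(int M, int K_original, int N, int kernel_volume,
                            const float* A, const float* B,
                            const int* out_in_map, float* C) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < kernel_volume; ++k) {
        int row = out_in_map[m * kernel_volume + k];
        if (row == -1) continue;  // this tap has no matching input voxel
        for (int c = 0; c < K_original; ++c) {
          acc += A[row * K_original + c] * B[(k * K_original + c) * N + n];
        }
      }
      C[m * N + n] = acc;
    }
  }
}

The tiled kernels compute this same contraction, staging 128-row tiles of A and 16- or 64-column tiles of B through shared memory, with the -1 check replaced by a zero-fill of the shared-memory tile.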
@@ -678,23 +678,23 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 1; ++ax0_ax1_fused_0_1) { uint4 B_loaded = make_uint4(0, 0, 0, 0); - global_load(B_loaded, B_ptr_local + (ax0_ax1_fused_0_1 * 16) * N, B_pred_guard); + global_load(B_loaded, B_ptr_local + (ax0_ax1_fused_0_1 * 16) * N, B_pred_guard); *(uint4 *)(B_shared_ptr + (ax0_ax1_fused_0_1 * 256)) = B_loaded; } __syncthreads(); #pragma unroll - for (int k_1 = 0; k_1 < ( 16 / 4); ++k_1) + for (int k_1 = 0; k_1 < ( 16 / 4); ++k_1) { #pragma unroll - for (int k_2 = 0; k_2 < 4; ++k_2) + for (int k_2 = 0; k_2 < 4; ++k_2) { int vk_in_block = (k_1 << 2) + k_2; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - C_local[i] = C_local[i] + - A_shared_reduce_ptr[((i / 4) * 16) * 16 + vk_in_block] + C_local[i] = C_local[i] + + A_shared_reduce_ptr[((i / 4) * 16) * 16 + vk_in_block] * B_shared_reduce_ptr[(vk_in_block * 16) + ((i % 4) * 4)]; } @@ -707,7 +707,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 for (int i = 0; i < 32; ++i) { int location_cur = location_offset + ((i / 4) * 16); - int vn = C_n_offset + ((i % 4) * 4); + int vn = C_n_offset + ((i % 4) * 4); if constexpr (N_ld_check) { @@ -723,34 +723,34 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 } // conv_forward_cuda_m128n16k32_f32f32f32 -__global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) +__global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) { float C_local[32]; __shared__ float A_shared[4096]; __shared__ float B_shared[512]; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { C_local[i] = 0.0; } - + int K_loops = (K_original * kernel_volume - 1) / 32 + 1; - int block_num_n = (N - 1) / 16 + 1; + int block_num_n = (N - 1) / 16 + 1; int blockIdx_m = (int)blockIdx.x / block_num_n; int blockIdx_n = (int)blockIdx.x % block_num_n; int threadIdx_x = (int)threadIdx.x; // hoisting shared pointer offsets - int * out_in_map_ptr = out_in_map - + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; + int * out_in_map_ptr = out_in_map + + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; - float * B_ptr = B - + (threadIdx_x / (16/4)) * N - + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); + float * B_ptr = B + + (threadIdx_x / (16/4)) * N + + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); float * A_shared_ptr = A_shared + (threadIdx_x * 4); - float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 32); + float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 32); float * B_shared_ptr = B_shared + (threadIdx_x * 4); float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 4); @@ -762,7 +762,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 #pragma unroll for (int k_0 = 0; k_0 < K_loops; ++k_0) { - int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; + int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; int kernel_offset = k_0 / (K_original / 32); int *out_in_map_ptr_k = out_in_map_ptr + kernel_offset; @@ -772,8 +772,8 @@ __global__ void 
__launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 16; ++ax0_ax1_fused_0) { - int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *8) * kernel_volume); - if (input_idx != -1) + int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *8) * kernel_volume); + if (input_idx != -1) { *(float4*)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = // ax0_ax1_fused_0 * elements loaded in each loop @@ -788,27 +788,27 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 } #pragma unroll - for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 2; ++ax0_ax1_fused_0_1) + for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 2; ++ax0_ax1_fused_0_1) { *(float4*)(B_shared_ptr + (ax0_ax1_fused_0_1 * 256)) = // ax0_ax1_fused_0_1 * elements loaded in each loop - *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 16)) * N); + *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 16)) * N); } __syncthreads(); #pragma unroll - for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) + for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) { #pragma unroll - for (int k_2 = 0; k_2 < 4; ++k_2) + for (int k_2 = 0; k_2 < 4; ++k_2) { int vk_in_block = (k_1 << 2) + k_2; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - C_local[i] = C_local[i] + - A_shared_reduce_ptr[((i / 4) * 16) * 32 + vk_in_block] + C_local[i] = C_local[i] + + A_shared_reduce_ptr[((i / 4) * 16) * 32 + vk_in_block] * B_shared_reduce_ptr[(vk_in_block * 16) + ((i % 4) * 4)]; } @@ -818,44 +818,44 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 } #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { int location_cur = location_offset + ((i / 4) * 16); - int vn = C_n_offset + ((i % 4) * 4); + int vn = C_n_offset + ((i % 4) * 4); if (location_cur < M) C[location_cur * N + vn] = C_local[i]; } } // conv_forward_cuda_m128n64k32_f32f32f32 -__global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) +__global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) { float C_local[64]; __shared__ float A_shared[4096]; __shared__ float B_shared[2048]; #pragma unroll - for (int i = 0; i < 64; ++i) + for (int i = 0; i < 64; ++i) { C_local[i] = 0.0; } - + int K_loops = (K_original * kernel_volume - 1) / 32 + 1; - int block_num_n = (N - 1) / 64 + 1; + int block_num_n = (N - 1) / 64 + 1; int blockIdx_m = (int)blockIdx.x / block_num_n; int blockIdx_n = (int)blockIdx.x % block_num_n; int threadIdx_x = (int)threadIdx.x; // hoisting shared pointer offsets - int * out_in_map_ptr = out_in_map - + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; + int * out_in_map_ptr = out_in_map + + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; - float * B_ptr = B - + (threadIdx_x / (64/4)) * N - + (blockIdx_n * 64) + ((threadIdx_x * 4) % 64); + float * B_ptr = B + + (threadIdx_x / (64/4)) * N + + (blockIdx_n * 64) + ((threadIdx_x * 4) % 64); float * A_shared_ptr = A_shared + (threadIdx_x * 4); - float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 16) * 32); + float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 16) * 32); float * B_shared_ptr = B_shared + (threadIdx_x * 4); float * 
B_shared_reduce_ptr = B_shared + (threadIdx_x % 16); @@ -867,7 +867,7 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 #pragma unroll for (int k_0 = 0; k_0 < K_loops; ++k_0) { - int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; + int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; int kernel_offset = k_0 / (K_original / 32); int *out_in_map_ptr_k = out_in_map_ptr + kernel_offset; @@ -877,8 +877,8 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) { - int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *16) * kernel_volume); - if (input_idx != -1) + int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *16) * kernel_volume); + if (input_idx != -1) { *(float4*)(A_shared_ptr + (ax0_ax1_fused_0 * 512)) = // ax0_ax1_fused_0 * elements loaded in each loop @@ -893,27 +893,27 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 } #pragma unroll - for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1) + for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1) { *(float4*)(B_shared_ptr + (ax0_ax1_fused_0_1 * 512)) = // ax0_ax1_fused_0_1 * elements loaded in each loop - *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 8)) * N); + *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 8)) * N); } __syncthreads(); #pragma unroll - for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) + for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) { #pragma unroll - for (int k_2 = 0; k_2 < 4; ++k_2) + for (int k_2 = 0; k_2 < 4; ++k_2) { int vk_in_block = (k_1 << 2) + k_2; #pragma unroll - for (int i = 0; i < 64; ++i) + for (int i = 0; i < 64; ++i) { - C_local[i] = C_local[i] + - A_shared_reduce_ptr[((i / 4) * 8) * 32 + vk_in_block] + C_local[i] = C_local[i] + + A_shared_reduce_ptr[((i / 4) * 8) * 32 + vk_in_block] * B_shared_reduce_ptr[(vk_in_block * 64) + ((i % 4) * 16)]; } @@ -923,10 +923,10 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 } #pragma unroll - for (int i = 0; i < 64; ++i) + for (int i = 0; i < 64; ++i) { int location_cur = location_offset + ((i / 4) * 8); - int vn = C_n_offset + ((i % 4) * 16); + int vn = C_n_offset + ((i % 4) * 16); if (location_cur < M) C[location_cur * N + vn] = C_local[i]; } @@ -944,10 +944,10 @@ void conv_forward_implicit_gemm_cuda( auto compute_capability = dev_ctx.GetComputeCapability(); bool allow_fp16 = compute_capability >= 75; bool is_half = _in_feats.dtype() == phi::DataType::FLOAT16; - + int num_in_feats = _in_feats.dims()[0]; int num_in_channels = _in_feats.dims()[1]; - + int kernel_volume = _out_in_map.dims()[1]; auto out_in_map = const_cast(_out_in_map.data()); @@ -1141,7 +1141,7 @@ void conv_forward_implicit_gemm_cuda( { int block_num_M = (num_out_feats + 127) / 128; int block_num_N = num_out_channels / 64; //j_factors1 - dim3 num_blocks(block_num_M * block_num_N); + dim3 num_blocks(block_num_M * block_num_N); dim3 threads_per_block(128); conv_forward_cuda_setting3_mode0_f32f32f32<<<num_blocks, threads_per_block>>>( _out_feats.dims()[0], num_in_channels, num_out_channels, kernel_volume, in_feats, kernel, out_in_map, out_feats); @@ -1150,7 +1150,7 @@ void conv_forward_implicit_gemm_cuda( { int block_num_M = (num_out_feats + 127) / 128; int block_num_N = num_out_channels / 16; //j_factors1 - dim3 num_blocks(block_num_M * block_num_N); + dim3 num_blocks(block_num_M * block_num_N); dim3 threads_per_block(64);
conv_forward_cuda_setting2_mode0_f32f32f32<<<num_blocks, threads_per_block>>>( _out_feats.dims()[0], num_in_channels, num_out_channels, kernel_volume, in_feats, kernel, out_in_map, out_feats); @@ -1159,7 +1159,7 @@ void conv_forward_implicit_gemm_cuda( { int block_num_M = (num_out_feats + 127) / 128; int block_num_N = (num_out_channels + 15) / 16; //j_factors1 - dim3 num_blocks(block_num_M * block_num_N); + dim3 num_blocks(block_num_M * block_num_N); dim3 threads_per_block(64); if (num_in_channels % 16 == 0) diff --git a/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh b/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh index 73ad53de502da..380abb419b40a 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh +++ b/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh @@ -65,7 +65,7 @@ class GPUHashTable { key_type* table_keys; val_type* table_vals; void insert_many_coords(const phi::GPUContext& dev_ctx, const int *coords, const int n); - void lookup_many_coords(const phi::GPUContext& dev_ctx, const int *coords, val_type *results, + void lookup_many_coords(const phi::GPUContext& dev_ctx, const int *coords, val_type *results, const int* kernel_sizes, const int* tensor_strides, const int n, const int kernel_volume); public: @@ -112,8 +112,8 @@ __global__ void insert_coords_kernel(key_type* table_keys, val_type* table_vals, template __global__ void lookup_coords_kernel( - key_type* table_keys, val_type* table_vals, const int* coords, val_type* vals, - const int* kernel_sizes, const int* strides, + key_type* table_keys, val_type* table_vals, const int* coords, val_type* vals, + const int* kernel_sizes, const int* strides, int n, int _capacity, int kernel_volume, int _width) { int tidx = blockIdx.x * blockDim.x + threadIdx.x; @@ -125,8 +125,8 @@ __global__ void lookup_coords_kernel( //coords_out[2] = in_coords[2]; //coords_out[3] = in_coords[3]; coords_out[0] = in_coords[0]; - - if constexpr (odd) + + if constexpr (odd) { #pragma unroll for(int i = 0; i <= _width-2; i++){ @@ -146,7 +146,7 @@ __global__ void lookup_coords_kernel( _kernel_idx /= kernel_sizes[i]; } } - + if (idx < n) { key_type key = (key_type)(hash_func_64b(coords_out, _width)); @@ -156,7 +156,7 @@ __global__ void lookup_coords_kernel( { key_type cur_key = table_keys[slot]; if (key == cur_key) - { + { vals[idx * kernel_volume + kernel_idx] = table_vals[slot] - 1; // need to subtract 1 to avoid extra operations in python } if (table_keys[slot] == EMPTY_CELL) @@ -181,7 +181,7 @@ void GPUHashTable::insert_coords(const phi::GPUContext& dev_ template void GPUHashTable::lookup_many_coords( const phi::GPUContext& dev_ctx, - const int* coords, val_type* results, + const int* coords, val_type* results, const int* kernel_sizes, const int* strides, const int n, const int kernel_volume){ if (kernel_volume % 2) diff --git a/paddle/phi/kernels/xpu/rms_norm_kernel.cc b/paddle/phi/kernels/xpu/rms_norm_kernel.cc index 698b2b195da82..85a4ea7291a14 100644 --- a/paddle/phi/kernels/xpu/rms_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/rms_norm_kernel.cc @@ -63,10 +63,10 @@ void RmsNormKernel(const Context& dev_ctx, const T* norm_weight_data = norm_weight.data(); const T* norm_bias_data = norm_bias ?
norm_bias.get().data() : nullptr; // float* inv_var_data = nullptr; - if (inv_var != nullptr) { - // inv_var_data = dev_ctx.template Alloc(inv_var); - PD_THROW("rms_norm in XPU kernel does not support inv_var output"); - } + // if (inv_var != nullptr) { + // inv_var_data = dev_ctx.template Alloc(inv_var); + // PD_THROW("rms_norm in XPU kernel does not support inv_var output"); + // } int32_t rows = 1; int32_t cols = 1; diff --git a/paddle/phi/kernels/xpu/swiglu_kernel.cc b/paddle/phi/kernels/xpu/swiglu_kernel.cc index a7815931fa6a8..9ba9c10ea1a43 100644 --- a/paddle/phi/kernels/xpu/swiglu_kernel.cc +++ b/paddle/phi/kernels/xpu/swiglu_kernel.cc @@ -50,7 +50,7 @@ void SwiGluKernel(const Context& ctx, reinterpret_cast(z_data), dims_vec, axis, - false, + true, const_nullptr, nullptr, y_ptr); diff --git a/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc b/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc index 994699a9fa63a..290081a48f36d 100644 --- a/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc +++ b/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc @@ -64,7 +64,7 @@ void SwiGluGradKernel(const Context& ctx, reinterpret_cast(dx_data), dims_vec, axis, - false, + true, y_ptr, dy_ptr); PADDLE_ENFORCE_XDNN_SUCCESS(ret, "swiglu_grad"); diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc index 5e665711efc8d..6b8dbf641f803 100644 --- a/paddle/phi/kernels/xpu/tile_kernel.cc +++ b/paddle/phi/kernels/xpu/tile_kernel.cc @@ -143,4 +143,5 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::float16) {} diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index 934e55ad90a92..702745d436beb 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -1067,6 +1067,7 @@ infer_meta : func : KernelWithXShapeInferMeta param : [xshape, out_grad] + spmd_rule : FlattenGradInferSpmd kernel : func : flatten_grad data_type : out_grad @@ -1825,6 +1826,22 @@ kernel : func : logsumexp_grad +- backward_op : lstm_grad + forward: lstm (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, bool use_peepholes + = true, bool is_reverse = false, bool is_test = false, str gate_activation = "sigmoid", + str cell_activation = "tanh", str candidate_activation = "tanh") -> Tensor (hidden), Tensor (cell), Tensor (batch_gate), Tensor (batch_cell_pre_act) + args: (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, Tensor hidden, Tensor cell, + Tensor batch_gate, Tensor batch_cell_pre_act, Tensor hidden_grad, bool use_peepholes, bool is_reverse, bool is_test, str gate_activation, + str cell_activation, str candidate_activation) + output: Tensor(input_grad), Tensor(h0_grad), Tensor(c0_grad), Tensor(weight_grad), Tensor(bias_grad) + infer_meta: + func: LSTMGradInferMeta + param: [input, h0, c0, weight, bias] + kernel: + func: lstm_grad + data_type: input + optional: h0, c0 + - backward_op : lu_grad forward : lu (Tensor x, bool pivot = true) -> Tensor(out), Tensor(pivots), Tensor(infos) args : (Tensor x, Tensor out, Tensor pivots, Tensor out_grad, bool pivot) diff --git a/paddle/phi/ops/yaml/fused_ops.yaml b/paddle/phi/ops/yaml/fused_ops.yaml index 5db39e9d207d7..1aac7524d84ab 100644 --- a/paddle/phi/ops/yaml/fused_ops.yaml +++ b/paddle/phi/ops/yaml/fused_ops.yaml @@ -56,6 +56,20 @@ data_transform : skip_transform : max_enc_len_this_time, max_dec_len_this_time +- op : block_multihead_attention_xpu + args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor 
seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor cache_k_per_batch_maxs, Tensor cache_v_per_batch_maxs, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask, Tensor tgt_mask, Tensor cache_k_quant_scales, Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype = "default") + output : Tensor(fmha_out), Tensor(qkv_out), Tensor(key_cache_out), Tensor(value_cache_out) + infer_meta : + func : BlockMultiheadAttentionInferXPUMeta + kernel : + func : block_multihead_attention_xpu + data_type : qkv + optional : pre_key_cache, pre_value_cache, rope_emb, mask, tgt_mask, cache_k_quant_scales, cache_v_quant_scales, cache_k_dequant_scales, cache_v_dequant_scales, qkv_out_scale, qkv_bias, out_shift, out_smooth, max_enc_len_this_time, max_dec_len_this_time + inplace : (qkv -> qkv_out), (key_cache -> key_cache_out), (value_cache -> value_cache_out) + support_dygraph_mode : true + data_transform : + skip_transform : max_enc_len_this_time, max_dec_len_this_time + - op : bn_act_xpu args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, float momentum, float epsilon, str data_format, int act_type) output : Tensor(out) @@ -400,6 +414,16 @@ func: fused_token_prune support_dygraph_mode : true +- op : fusion_group + args: (Tensor[] inputs, int[] outs_dtype = {}, int[] inputs_dtype = {}, str func_name = "", int type + = 0) + output: Tensor[] (outs){inputs.size()} + infer_meta: + func: FusionGroupInferMeta + kernel: + func: fusion_group + data_type : DataType::FLOAT32 + - op : fusion_gru args : (Tensor x, Tensor h0, Tensor weight_x, Tensor weight_h, Tensor bias, str activation = "tanh", str gate_activation = "sigmoid", bool is_reverse = false, bool use_seq = true, bool origin_mode = false, bool force_fp32_output = false) output : Tensor(reordered_h0), Tensor(xx), Tensor(batched_input), Tensor(batched_out), Tensor(hidden) @@ -685,3 +709,15 @@ func : yolo_box_xpu data_type : x optional : x_max + +- op: add_group_norm_silu + args : (Tensor x,Tensor residual, Tensor scale, Tensor bias, float epsilon = 1e-5, int groups = -1, str data_format = "NCHW", str activation = "") + output : Tensor(y), Tensor(residual_out), Tensor(mean), Tensor(variance) + infer_meta : + func : AddGroupNormSiluInferMeta + kernel : + func : add_group_norm_silu + data_type : x + optional : scale, bias, residual, residual_out + support_dygraph_mode : true + interfaces : paddle::dialect::LayoutTransformationInterface diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index ddfe98cefcc80..2f59244893ffc 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -49,14 +49,6 @@ inplace : (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface -- op : assign_pos - args : (Tensor x, Tensor cum_count, Tensor eff_num_len) - output : Tensor(out) - infer_meta : - func : AssignPosInferMeta - kernel : - func : assign_pos - - op : 
assign_value args : (int[] shape, DataType dtype, Scalar[] values, Place place = {}) output : Tensor(out) @@ -196,15 +188,6 @@ data_type : dtype inplace: (input -> output) -- op : decayed_adagrad - args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f) - output : Tensor(param_out), Tensor(moment_out) - infer_meta : - func : DecayedAdagradInferMeta - kernel : - func : decayed_adagrad - data_type : param - - op : dequantize_linear args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false) output : Tensor(y), Tensor(out_state), Tensor(out_accum), Tensor(out_scale) @@ -859,16 +842,6 @@ backward : subtract_grad interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : tdm_sampler - args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2) - output: Tensor(out), Tensor(labels), Tensor(mask) - infer_meta: - func : TdmSamplerInferMeta - kernel: - func : tdm_sampler - data_type : x - optional : labels - - op : tile args : (Tensor x, IntArray repeat_times = {}) output : Tensor(out) @@ -1001,15 +974,6 @@ optional: bias, sample_weight, custom_dist_probs, custom_dist_alias, custom_dist_alias_probs backward: nce_grad -- op: number_count - args: (Tensor numbers, int upper_range) - output: Tensor(out) - infer_meta: - func: NumberCountInferMeta - kernel: - func: number_count - data_type: numbers - - op: onednn_to_paddle_layout args: (Tensor x, int dst_layout) output: Tensor(out) diff --git a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml index 335952bc3475c..9a327ef5dd4b3 100644 --- a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml +++ b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml @@ -5,7 +5,6 @@ - amax_grad - amin_grad - cast_grad -- channel_shuffle_grad - conv2d_transpose_double_grad - conv2d_transpose_grad - deformable_conv_grad @@ -34,7 +33,6 @@ - repeat_interleave_grad - repeat_interleave_with_tensor_index_grad - rnn_grad -- rrelu_grad - set_value_with_tensor_grad - slice_double_grad - slice_grad diff --git a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml index 160e33c5b36c8..703c948240df0 100644 --- a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml +++ b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml @@ -22,7 +22,6 @@ - c_sync_calc_stream - c_sync_comm_stream - cast -- channel_shuffle - conv2d_transpose - conv2d_transpose_bias - copy_to @@ -75,7 +74,6 @@ - repeat_interleave - repeat_interleave_with_tensor_index - rnn -- rrelu - sequence_mask - set_value_with_tensor - slice diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index 1280fd3716f0a..d9d0c222b770f 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -699,6 +699,14 @@ func : swish backward : swish_grad +- op : transfer_layout + args: (Tensor x, int src_layout = -1, int dst_layout=-1) + output: Tensor (out) + infer_meta: + func: TransferLayoutInferMeta + kernel: + func: transfer_layout + - op : tril_indices args : (int rows = 0, int cols = 0, int offset = 0, DataType dtype = DataType::INT64) output : Tensor(out) diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 2f7af0b64c802..6fb5afeb87a07 100755 --- 
a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -4062,6 +4062,12 @@ outputs : {slimmed_x : SlimmedX, cls_inds : CLSInds} +- op: fusion_group + inputs: + inputs : Inputs + outputs: + outs : Outs + - op: fusion_seqpool_cvm_concat inputs: {x : X, cvm : CVM} @@ -4129,6 +4135,15 @@ outputs: {out: Out} +- op: lstm + backward: lstm_grad + inputs: + {input : Input, h0 : H0, c0 : C0, weight : Weight, bias : Bias} + outputs: + {hidden : Hidden, cell : Cell, batch_gate : BatchGate, batch_cell_pre_act : BatchCellPreAct} + extra: + outputs: [batch_gate, batch_cell_pre_act] + - op: lu backward: lu_grad inputs: @@ -4250,6 +4265,8 @@ {x: X} outputs: {out: Out, noise: Noise} + extra: + outputs: [noise] - op: send_v2 inputs : @@ -4355,6 +4372,12 @@ outputs : out : Out +- op: transfer_layout + inputs: + x : X + outputs: + out : Out + - op: uniform_random_batch_size_like inputs: input : Input diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index e758d5e0438f0..21aa2868fb8b2 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -321,6 +321,14 @@ backward : assign_out__grad traits : pir::SideEffectTrait +- op : assign_pos + args : (Tensor x, Tensor cum_count, Tensor eff_num_len) + output : Tensor(out) + infer_meta : + func : AssignPosInferMeta + kernel : + func : assign_pos + - op : assign_value_ args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) output : Tensor(out) @@ -760,6 +768,7 @@ kernel : func : class_center_sample data_type : label + traits : pir::SideEffectTrait - op : clip args : (Tensor x, Scalar(float) min, Scalar(float) max) @@ -1052,6 +1061,15 @@ backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : decayed_adagrad + args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f) + output : Tensor(param_out), Tensor(moment_out) + infer_meta : + func : DecayedAdagradInferMeta + kernel : + func : decayed_adagrad + data_type : param + - op : decode_jpeg args : (Tensor x, str mode, Place place) output : Tensor(out) @@ -1262,6 +1280,7 @@ optional : seed_tensor intermediate : mask backward : dropout_grad + traits : pir::SideEffectTrait - op : edit_distance args : (Tensor hyps, Tensor refs, Tensor hypslength, Tensor refslength, bool normalized = false) @@ -1672,6 +1691,7 @@ output : Tensor(out), Tensor(xshape) infer_meta : func : FlattenWithXShapeInferMeta + spmd_rule : FlattenInferSpmd kernel : func : flatten data_type : x @@ -2658,6 +2678,20 @@ backward : logsumexp_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : lstm + args: (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, bool use_peepholes + = true, bool is_reverse = false, bool is_test = false, str gate_activation = "sigmoid", + str cell_activation = "tanh", str candidate_activation = "tanh") + output: Tensor (hidden), Tensor (cell), Tensor (batch_gate), Tensor (batch_cell_pre_act) + infer_meta: + func: LSTMInferMeta + kernel: + func: lstm + data_type: input + optional: h0, c0 + intermediate: batch_gate, batch_cell_pre_act + backward: lstm_grad + - op : lstsq args : (Tensor x, Tensor y, Scalar rcond=0.0f, str driver="gels") output : Tensor(solution), Tensor(residuals), Tensor(rank), Tensor(singular_values) @@ -3584,7 +3618,7 @@ traits : pir::SideEffectTrait - op : rrelu - args : (Tensor x, float lower, float upper, bool is_test) + args : (Tensor x, float lower=1.0f/8, float upper=1.0f/3, bool is_test=false) 
output : Tensor(out), Tensor(noise) infer_meta : func : RReluInferMeta @@ -4148,6 +4182,25 @@ func : tanh_shrink backward : tanh_shrink_grad +- op : tdm_child + args: (Tensor x, Tensor tree_info, int child_nums, DataType dtype = DataType::INT32) + output: Tensor (child), Tensor (leaf_mask) + infer_meta: + func: TdmChildInferMeta + kernel: + func: tdm_child + data_type: x + +- op : tdm_sampler + args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2) + output: Tensor(out), Tensor(labels), Tensor(mask) + infer_meta: + func : TdmSamplerInferMeta + kernel: + func : tdm_sampler + data_type : x + optional : labels + - op : temporal_shift args : (Tensor x, int seg_num, float shift_ratio = 0.25f, str data_format = "NCHW") output : Tensor(out) @@ -4374,6 +4427,7 @@ data_type: x inplace: (x -> out) backward: uniform_inplace_grad + traits : pir::SideEffectTrait - op : uniform_random_batch_size_like args: (Tensor input, int[] shape, int input_dim_idx = 0, int output_dim_idx = 0, @@ -4386,6 +4440,7 @@ uniform_random_batch_size_like_sr {selected_rows -> selected_rows} data_type: dtype no_need_buffer: input + traits : pir::SideEffectTrait - op : unique_consecutive args : (Tensor x, bool return_inverse = false, bool return_counts = false, int[] axis = {}, DataType dtype = DataType::FLOAT32) @@ -4631,3 +4686,12 @@ func: MoeInferMeta kernel: func: moe + +- op: number_count + args: (Tensor numbers, int upper_range) + output: Tensor(out) + infer_meta: + func: NumberCountInferMeta + kernel: + func: number_count + data_type: numbers diff --git a/paddle/pir/include/core/program.h b/paddle/pir/include/core/program.h index d838916eefea5..4d0da62a98c84 100644 --- a/paddle/pir/include/core/program.h +++ b/paddle/pir/include/core/program.h @@ -57,6 +57,7 @@ class IR_API Program { std::shared_ptr<Program> Clone(IrMapping& ir_mapping) const; // NOLINT + void CopyToBlock(IrMapping& ir_mapping, Block* insert_block) const; // NOLINT Block* block() { return &module_.block(); } const Block* block() const { return &module_op().block(); } @@ -70,9 +71,13 @@ class IR_API Program { parameters_ = parameters; } + uint64_t id() const { return id_; } + private: // computation graph ModuleOp module_; + // unique in current process, "almost" unique between processes. + uint64_t id_; // weight ParameterMap parameters_; }; diff --git a/paddle/pir/include/dialect/shape/utils/shape_analysis.h b/paddle/pir/include/dialect/shape/utils/shape_analysis.h index bbdda621511eb..0256d97dbc2b1 100644 --- a/paddle/pir/include/dialect/shape/utils/shape_analysis.h +++ b/paddle/pir/include/dialect/shape/utils/shape_analysis.h @@ -42,7 +42,7 @@ class IR_API InferSymbolicShapeContext { const symbol::ShapeOrDataDimExprs& GetShapeOrDataForValue(Value val) const; - void SetStaticShapeForValue(Value val); + void SetSymbolForValueByStaticShape(Value val); void SetShapeOrDataForValue(Value val, const symbol::ShapeOrDataDimExprs& shape_or_data); @@ -150,7 +150,7 @@ class IR_API ShapeConstraintIRAnalysis final friend void InferSymExprForAllValues(ModuleOp module_op); - void SetStaticShapeForValue(Value val); + void SetSymbolForValueByStaticShape(Value val); void InferShapeOrDataForValue(Value val); diff --git a/paddle/pir/src/core/program.cc b/paddle/pir/src/core/program.cc index 19d08f094fd4c..453cf3eb170df 100644 --- a/paddle/pir/src/core/program.cc +++ b/paddle/pir/src/core/program.cc @@ -13,13 +13,48 @@ // limitations under the License.
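Note: the new `Program::id()` above is documented as unique in-process but only "almost" unique between processes, because each process draws its id at random with no cross-process coordination. A back-of-the-envelope birthday bound (my own check, not part of the patch) shows why a uniform draw from a 63-bit space is safe in practice:

```python
import math

# Back-of-the-envelope check (not part of the patch): ids are drawn
# uniformly from [0, 2^63), so across n processes the probability of
# any collision is roughly 1 - exp(-n*(n-1) / (2 * 2^63)).
def collision_probability(n: int, bits: int = 63) -> float:
    pairs = n * (n - 1) / 2
    return -math.expm1(-pairs / 2.0**bits)

print(f"{collision_probability(1_000_000):.1e}")  # ~5.4e-08 for a million processes
```

Even a million concurrently running programs collide with probability around 5e-8, which justifies the hedged wording in the header comment.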
#include "paddle/pir/include/core/program.h" +#include +#include +#include +#include #include "glog/logging.h" #include "paddle/pir/include/core/ir_context.h" namespace pir { +namespace { + +int64_t GetRandomId() { + std::random_device rd{}; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dis( + 0, std::numeric_limits::max()); + return dis(gen); +} + +bool InsertGlobalStorageSuccess(int64_t random_id) { + static std::unordered_set storage; + static std::mutex mutex; + std::unique_lock lock(mutex); + return storage.emplace(random_id).second; +} + +int64_t GetUniqueRandomId() { + int kLimit = 100; + for (int i = 0; i < kLimit; ++i) { + int64_t random_id = GetRandomId(); + if (InsertGlobalStorageSuccess(random_id)) { + return random_id; + } + } + LOG(FATAL) << "Fatal bug occured in GetUniqueRandomId()."; +} + +} // namespace + Program::Program(IrContext* context) { module_ = ModuleOp::Create(context, this); + id_ = GetUniqueRandomId(); } Program::~Program() { @@ -39,6 +74,26 @@ std::shared_ptr Program::Clone(IrMapping& ir_mapping) const { return new_program; } +void Program::CopyToBlock(IrMapping& ir_mapping, Block* insert_block) const { + auto clone_options = CloneOptions::All(); + for (const auto& op : *block()) { + bool skip_op = false; + for (uint32_t i = 0; i < op.num_results(); i++) { + if (ir_mapping.GetMutableMap().count(op.result(i))) { + skip_op = true; + break; + } + } + if (skip_op) { + continue; + } + + auto* new_op = op.Clone(ir_mapping, clone_options); + insert_block->push_back(new_op); + } + return; +} + Parameter* Program::GetParameter(const std::string& name) const { if (parameters_.count(name) != 0) { return parameters_.at(name).get(); diff --git a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc index 343b1bf329c2c..e51cf34aa4bc9 100644 --- a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc +++ b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc @@ -126,6 +126,10 @@ void DebugPrintOpInfo(pir::Operation* op, std::ostringstream print_stream; for (uint32_t i = 0; i < op->num_results(); ++i) { const auto& res = op->result(i); + if (!res || !res.type()) { + continue; + } + print_stream << "\tresult(" << res.dyn_cast().index() << ") " << "ShapeOrData: {"; @@ -170,6 +174,10 @@ void CheckInferSymWithInferMeta( pir::InferSymbolicShapeContext* infer_context = nullptr) { for (uint32_t i = 0; i < op->num_results(); ++i) { const auto& res = op->result(i); + if (!res || !res.type()) { + continue; + } + std::ostringstream print_stream; // InferMeta funcs of some Ops are not corrrect now, we don't check them. 
@@ -299,7 +307,7 @@ void InferSymExprForBlock(const Block& block, << " DOES NOT have InferSymbolicShapeInterface!"; } for (uint32_t i = 0; i < op.num_results(); ++i) { - infer_context->SetStaticShapeForValue(op.result(i)); + infer_context->SetSymbolForValueByStaticShape(op.result(i)); } } DebugPrintOpInfo(&op, infer_context); @@ -314,6 +322,9 @@ void InferSymExprForAllValues(ModuleOp module_op) { auto infer_context = shape_analysis.MutInferSymbolicShapeContext(); for (uint32_t i = 0; i < module_op->num_regions(); i++) { for (auto& block : module_op->region(i)) { + for (auto& [_, value] : block.kwargs()) { + infer_context->SetSymbolForValueByStaticShape(value); + } InferSymExprForBlock(block, infer_context); } } diff --git a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc index b62ad0f2a3d95..d73908b0db0b4 100644 --- a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc +++ b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc @@ -67,14 +67,15 @@ InferSymbolicShapeContext::GetShapeOrDataForValue(Value val) const { return value_id_to_shape_or_data_.at(val.impl()->id()); } -void InferSymbolicShapeContext::SetStaticShapeForValue(Value val) { +void InferSymbolicShapeContext::SetSymbolForValueByStaticShape(Value val) { const auto& value_type = val.type(); if (!val || !value_type) { - PADDLE_THROW( - phi::errors::Fatal("Set static shape for null value is FOBBIDEN!")); + LOG(WARNING) << "Risk on SetSymbolForValueByStaticShape for null value"; + return; } if (!IsStaticShape(val)) { - LOG(WARNING) << "Risk on SetStaticShapeForValue for contain_unknown_dim"; + LOG(WARNING) + << "Risk on SetSymbolForValueByStaticShape for contain_unknown_dim"; } const auto& GetStaticShapeForDenseTensorType = [&](DenseTensorType type_info) -> symbol::TensorShapeOrDataDimExprs { @@ -289,8 +290,8 @@ const std::string ShapeConstraintIRAnalysis::GetNextSymName() { return context_.GetNextSymName(); } -void ShapeConstraintIRAnalysis::SetStaticShapeForValue(Value val) { - context_.SetStaticShapeForValue(val); +void ShapeConstraintIRAnalysis::SetSymbolForValueByStaticShape(Value val) { + context_.SetSymbolForValueByStaticShape(val); } void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { @@ -319,7 +320,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { for (auto& operand : GetRealOperandSource(op)) { if (operand.impl() && !context_.HasShapeOrDataForValue(operand)) { if (!operand.defining_op()) { - SetStaticShapeForValue(operand); + SetSymbolForValueByStaticShape(operand); } else { Visit(operand.defining_op()); } @@ -334,7 +335,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { for (auto& operand : GetRealOperandSource(op)) { if (operand.impl() && !context_.HasShapeOrDataForValue(operand)) { if (!operand.defining_op()) { - SetStaticShapeForValue(operand); + SetSymbolForValueByStaticShape(operand); } else { has_prev_op = true; } @@ -394,7 +395,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { << " DOES NOT have InferSymbolicShapeInterface!"; for (auto& result_value : op->results()) { if (result_value && (!context_.HasShapeOrDataForValue(result_value))) { - SetStaticShapeForValue(result_value); + SetSymbolForValueByStaticShape(result_value); } } } @@ -412,7 +413,7 @@ ShapeConstraintIRAnalysis::GetShapeOrDataForValue(Value val) { if (!context_.HasShapeOrDataForValue(val)) { // backtrack to infer shape from defining op if (!val.defining_op()) { - SetStaticShapeForValue(val); + 
SetSymbolForValueByStaticShape(val); } else { VLOG(3) << "InferShapeOrDataForValue, defining_op: " << val.defining_op()->name(); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 45b796671852e..7fcb1898bbe62 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -94,6 +94,7 @@ if not defined retry_times set retry_times=1 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python38 if not defined BUILD_DIR set BUILD_DIR=build if not defined TEST_INFERENCE set TEST_INFERENCE=ON +if not defined WITH_PIP_CUDA_LIBRARIES set WITH_PIP_CUDA_LIBRARIES=OFF set task_name=%1 set UPLOAD_TP_FILE=OFF @@ -301,6 +302,7 @@ rem ------Build windows avx whl package------ :CASE_build_avx_whl set WITH_AVX=ON set ON_INFER=ON +set WITH_PIP_CUDA_LIBRARIES=ON if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All call :cmake || goto cmake_error @@ -515,7 +517,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^ --DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% +-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% >> %work_dir%\win_cmake.sh echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ @@ -525,7 +527,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^ --DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% >> %work_dir%\win_cmake.sh +-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% >> %work_dir%\win_cmake.sh cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ @@ -535,7 +537,7 @@ cmake .. 
-G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^ --DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% +-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% goto:eof :cmake_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8c0266c36e8c1..e793c210628be 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1130,7 +1130,10 @@ function check_whl_size() { function generate_upstream_develop_api_spec() { set -x + # Temporarily save some scripts from PR branch cp ${PADDLE_ROOT}/python/requirements.txt /tmp + cp ${PADDLE_ROOT}/tools/print_signatures.py /tmp + mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/ pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "pr_whl_size: ${pr_whl_size}" @@ -1178,17 +1181,20 @@ function generate_api_spec() { echo "Not supported $2" exit 1 fi + if [ "$spec_kind" == "DEV" ]; then + REQUIREMENTS_PATH=/tmp/requirements.txt + PRINT_SIGNATURES_SCRIPT_PATH=/tmp/print_signatures.py + else + REQUIREMENTS_PATH=${PADDLE_ROOT}/python/requirements.txt + PRINT_SIGNATURES_SCRIPT_PATH=${PADDLE_ROOT}/tools/print_signatures.py + fi mkdir -p ${PADDLE_ROOT}/build/.check_api_workspace cd ${PADDLE_ROOT}/build/.check_api_workspace virtualenv -p `which python` .${spec_kind}_env source .${spec_kind}_env/bin/activate + pip install -r $REQUIREMENTS_PATH - if [ "$spec_kind" == "DEV" ]; then - pip install -r /tmp/requirements.txt - else - pip install -r ${PADDLE_ROOT}/python/requirements.txt - fi if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then pip install ${PADDLE_ROOT}/build/python/dist/*whl elif [ -d "${PADDLE_ROOT}/dist/" ];then @@ -1196,7 +1202,10 @@ function generate_api_spec() { mkdir ${PADDLE_ROOT}/build/python/dist/ && mv ${PADDLE_ROOT}/dist/*whl ${PADDLE_ROOT}/build/python/dist/ fi spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec - python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path + python ${PRINT_SIGNATURES_SCRIPT_PATH} paddle > $spec_path + python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="args,varargs,varkw,defaults,kwonlyargs,kwonlydefaults" paddle > ${spec_path}.api + python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="annotations" paddle > ${spec_path}.annotations + python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="document" paddle > ${spec_path}.doc # used to log op_register data_type op_type_path=${PADDLE_ROOT}/paddle/fluid/OP_TYPE_${spec_kind}.spec @@ -1214,9 +1223,6 @@ function generate_api_spec() { api_source_md5_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.source.md5 python ${PADDLE_ROOT}/tools/count_api_without_core_ops.py -p paddle > $api_source_md5_path - awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc - awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py \ ${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_${spec_kind}.spec @@ -1474,7 +1480,7 @@ function card_test() { if [ 
"${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 elif [ "${WITH_ROCM}" == "ON" ];then - CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) + CUDA_DEVICE_COUNT=$(rocm-smi -i | grep DCU | wc -l) elif [ "${WITH_IPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 else @@ -1517,13 +1523,22 @@ function card_test() { if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + if [ "$WITH_ROCM" == "ON" ];then + (env HIP_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + else + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + fi fi else if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + if [ "$WITH_ROCM" == "ON" ];then + (env HIP_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + else + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + fi + fi fi done @@ -2652,7 +2667,11 @@ set -x fi if [ -a "$PADDLE_ROOT/added_ut" ];then added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ - env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + if [ "$WITH_ROCM" == "ON" ];then + env HIP_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + else + env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + fi ctest -R "(${added_uts})" -L "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error_1=$? 
if [ "$added_ut_error" != 0 ] && [ "$added_ut_error_1" != 0 ];then echo "========================================" @@ -2826,7 +2845,9 @@ set +x rerun_ut_endTime_s=`date +%s` echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt - cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/ + if [ "$WITH_ROCM" != "ON" ];then + cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/ + fi if [[ "$EXIT_CODE" != "0" ]]; then show_ut_retry_result fi @@ -3488,7 +3509,6 @@ function build_document_preview() { sh /paddle/tools/document_preview.sh ${PORT} } - # origin name: example function exec_samplecode_test() { if [ -d "${PADDLE_ROOT}/build/pr_whl" ];then @@ -3502,10 +3522,10 @@ function exec_samplecode_test() { cd ${PADDLE_ROOT}/tools if [ "$1" = "cpu" ] ; then - python sampcd_processor.py --debug --mode cpu; example_error=$? + python sampcd_processor.py --mode cpu; example_error=$? elif [ "$1" = "gpu" ] ; then SAMPLE_CODE_EXEC_THREADS=${SAMPLE_CODE_EXEC_THREADS:-2} - python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} --debug --mode gpu; example_error=$? + python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} --mode gpu; example_error=$? fi if [ "$example_error" != "0" ];then echo "Code instance execution failed" >&2 @@ -3513,6 +3533,75 @@ function exec_samplecode_test() { fi } +function need_type_checking() { + set +x + + # check pr title + TITLE_CHECK=`curl -s https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID} | grep "" | grep -i "typing" || true` + + if [[ ${TITLE_CHECK} ]]; then + set -x + return 0 + else + set -x + return 1 + fi +} + +function exec_type_checking() { + if [ -d "${PADDLE_ROOT}/build/pr_whl" ];then + pip install ${PADDLE_ROOT}/build/pr_whl/*.whl + else + echo "WARNING: PR wheel is not found. Use develop wheel !!!" + pip install ${PADDLE_ROOT}/build/python/dist/*.whl + fi + + python -c "import paddle;print(paddle.__version__);paddle.version.show()" + + cd ${PADDLE_ROOT}/tools + + # check all sample code + TITLE_CHECK_ALL=`curl -s https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID} | grep "<title>" | grep -i "typing all" || true` + + if [[ ${TITLE_CHECK_ALL} ]]; then + python type_checking.py --full-test; type_checking_error=$? + else + python type_checking.py; type_checking_error=$? + fi + + if [ "$type_checking_error" != "0" ];then + echo "Example code type checking failed" >&2 + exit 5 + fi +} + + +function exec_samplecode_checking() { + example_info_gpu="" + example_code_gpu=0 + if [ "${WITH_GPU}" == "ON" ] ; then + { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1 + example_code_gpu=$? + fi + { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 + example_code=$? + + # TODO(megemini): type_checkding should be default after type annotation been done. + need_type_checking + type_checking_status=$? + + if [[ ${type_checking_status} -eq 0 ]]; then + { type_checking_info=$(exec_type_checking 2>&1 1>&3 3>/dev/null); } 3>&1 + type_checking_code=$? 
+ fi + + summary_check_example_code_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}" + + if [[ ${type_checking_status} -eq 0 ]]; then + summary_type_checking_problems $type_checking_code "$type_checking_info" + fi +} + function collect_ccache_hits() { ccache -s @@ -3553,10 +3642,11 @@ function test_model_benchmark() { bash ${PADDLE_ROOT}/tools/test_model_benchmark.sh } -function summary_check_problems() { +function summary_check_example_code_problems() { set +x local example_code=$1 local example_info=$2 + if [ $example_code -ne 0 ];then echo "===============================================================================" echo "*****Example code error***** Please fix the error listed in the information:" @@ -3579,6 +3669,33 @@ function summary_check_problems() { } +function summary_type_checking_problems() { + set +x + local type_checking_code=$1 + local type_checking_info=$2 + + if [ $type_checking_code -ne 0 ];then + echo "===============================================================================" + echo "*****Example code type checking error***** Please fix the error listed in the information:" + echo "===============================================================================" + echo "$type_checking_info" + echo "===============================================================================" + echo "*****Example code type checking FAIL*****" + echo "===============================================================================" + exit $type_checking_code + else + echo "===============================================================================" + echo "*****Example code type checking info*****" + echo "===============================================================================" + echo "$type_checking_info" + echo "===============================================================================" + echo "*****Example code type checking PASS*****" + echo "===============================================================================" + fi + set -x +} + + function reuse_so_cache() { get_html="https://api.github.com/repos/PaddlePaddle/Paddle" curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt @@ -3631,7 +3748,10 @@ function build_pr_and_develop() { fi mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` + # Temporarily save some scripts from PR branch cp ${PADDLE_ROOT}/python/requirements.txt /tmp + cp ${PADDLE_ROOT}/tools/print_signatures.py /tmp + generate_api_spec "$1" "PR" mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp @@ -4262,15 +4382,7 @@ function main() { check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" set +e - example_info_gpu="" - example_code_gpu=0 - if [ "${WITH_GPU}" == "ON" ] ; then - { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code_gpu=$? - fi - { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code=$? 
- summary_check_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}" + exec_samplecode_checking assert_api_spec_approvals ;; build_and_check_cpu) @@ -4282,15 +4394,7 @@ function main() { ;; build_and_check_gpu) set +e - example_info_gpu="" - example_code_gpu=0 - if [ "${WITH_GPU}" == "ON" ] ; then - { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code_gpu=$? - fi - { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code=$? - summary_check_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}" + exec_samplecode_checking assert_api_spec_approvals ;; check_whl_size) @@ -4395,6 +4499,9 @@ function main() { export FLAGS_PIR_OPTEST=True parallel_test true ;; + hyg_dcu_test) + parallel_test + ;; nv_cicheck_coverage) parallel_test nv_test @@ -4416,10 +4523,6 @@ function main() { build ${parallel_number} run_brpc_test ;; - assert_api) - generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} - assert_api_spec_approvals - ;; test_inference) PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" if [ "${WITH_PYTHON}" == "OFF" ] ; then @@ -4449,9 +4552,6 @@ function main() { gen_fluid_lib ${parallel_number} test_fluid_lib_train ;; - assert_api_approvals) - assert_api_spec_approvals - ;; assert_file_approvals) assert_file_diff_approvals ;; @@ -4533,11 +4633,6 @@ function main() { build ${parallel_number} build_document_preview ;; - api_example) - { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code=$? - summary_check_problems $example_code "$example_info" - ;; test_op_benchmark) test_op_benchmark ;; diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat index 0aeacfef7f9bd..4ffec08e666e2 100644 --- a/paddle/scripts/windows_build/build.bat +++ b/paddle/scripts/windows_build/build.bat @@ -1,5 +1,5 @@ @ECHO OFF -SETLOCAL +SETLOCAL set source_path=%1 set PYTHON_DIR=%2 set WITH_GPU=%3 diff --git a/pyproject.toml b/pyproject.toml index 4a4a5a73c5fda..0391f1bf823f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -131,3 +131,32 @@ known-first-party = ["paddle"] "test/dygraph_to_static/test_loop.py" = ["C416", "F821"] # Ignore unnecessary lambda in dy2st unittest test_lambda "test/dygraph_to_static/test_lambda.py" = ["PLC3002"] + +[tool.mypy] +python_version = "3.8" +cache_dir = ".mypy_cache" +# Miscellaneous strictness flags +allow_redefinition = true +local_partial_types = true +strict = false +# Untyped definitions and calls +check_untyped_defs = true +# Import discovery +follow_imports = "normal" +# Miscellaneous +warn_unused_configs = true +# Configuring warnings +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +# Configuring error messages +show_column_numbers = true + +[[tool.mypy.overrides]] +module = [ + "astor", + "cv2", + "scipy", + "xlsxwriter" +] +ignore_missing_imports = true diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b3029a24309cf..16501a254f280 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,7 +25,7 @@ set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES}) if(WITH_GPU) set(PACKAGE_NAME "paddlepaddle-gpu") elseif(WITH_ROCM) - set(PACKAGE_NAME "paddlepaddle-rocm") + set(PACKAGE_NAME "paddlepaddle-dcu") elseif(WITH_XPU) set(PACKAGE_NAME "paddlepaddle-xpu") elseif(WITH_IPU) @@ -173,17 +173,10 @@ endif() add_custom_target(paddle_python ALL DEPENDS 
${PADDLE_PYTHON_BUILD_DIR}/.timestamp) + if(BUILD_WHL_PACKAGE AND NOT WITH_SETUP_INSTALL) - add_custom_target( - paddle_copy ALL - # generate tensor.pyi for type hints - COMMAND - ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/gen_tensor_stub.py - --input-file - ${PADDLE_SOURCE_DIR}/python/paddle/tensor/tensor.prototype.pyi - --output-file ${PADDLE_BINARY_DIR}/python/paddle/tensor/tensor.pyi - DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel) + add_custom_target(paddle_copy ALL + DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel) add_dependencies(paddle_copy paddle_python) endif() diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 0cd36f299ecd6..37409b626009b 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -14,12 +14,16 @@ import typing +__is_metainfo_generated = False try: from paddle.cuda_env import * # noqa: F403 from paddle.version import ( # noqa: F401 commit as __git_commit__, full_version as __version__, ) + + __is_metainfo_generated = True + except ImportError: import sys @@ -272,6 +276,7 @@ atleast_1d, atleast_2d, atleast_3d, + block_diag, broadcast_tensors, broadcast_to, cast, @@ -433,6 +438,7 @@ inner, inverse, isfinite, + isin, isinf, isnan, isneginf, @@ -577,8 +583,7 @@ if os.path.exists(cuh_file): os.environ.setdefault('runtime_include_dir', runtime_include_dir) - -if is_compiled_with_cuda(): +if __is_metainfo_generated and is_compiled_with_cuda(): import os import platform @@ -679,7 +684,9 @@ ctypes.CDLL('msvcp140.dll') ctypes.CDLL('vcruntime140_1.dll') except OSError: - print( + import logging + + logging.error( '''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure. It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe''' ) @@ -699,7 +706,6 @@ path_patched = False for dll in dlls: is_loaded = False - print("dll:", dll) if with_load_library_flags: res = kernel32.LoadLibraryExW(dll, None, 0x00001100) last_error = ctypes.get_last_error() @@ -733,6 +739,7 @@ ir_guard._switch_to_pir() __all__ = [ + 'block_diag', 'iinfo', 'finfo', 'dtype', @@ -846,6 +853,7 @@ 'squeeze_', 'to_tensor', 'gather_nd', + 'isin', 'isinf', 'isneginf', 'isposinf', diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 9ae60e5185ee0..34318f3cc9183 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -251,7 +251,7 @@ def _pir_transform(t, dtype): param = op.operand(0).source() cast_param = paddle.cast(param, dtype) cast_param.persistable = True - paddle._pir_ops.updata_parameter(cast_param, t.name) + paddle._pir_ops.update_parameter(cast_param, t.name) block.remove_op(op) break main.set_parameters_from(startup) diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index 0649c3e19bf05..49863ec16363a 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -29,20 +29,21 @@ # TODO: Consider a better way to mark these ops has no grad op. # Such as use a new trait to mark these ops. +# Please keep them in alphabetical order.
ALLOW_NO_GRAD_OPS = [ # Compare ops "pd_op.equal", "pd_op.equal_", - "pd_op.not_equal", - "pd_op.not_equal_", - "pd_op.less_than", - "pd_op.less_than_", - "pd_op.less_equal", - "pd_op.less_equal_", "pd_op.greater_than", "pd_op.greater_than_", "pd_op.greater_equal", "pd_op.greater_equal_", + "pd_op.less_than", + "pd_op.less_than_", + "pd_op.less_equal", + "pd_op.less_equal_", + "pd_op.not_equal", + "pd_op.not_equal_", # Logical ops "pd_op.logical_and", "pd_op.logical_and_", @@ -67,35 +68,39 @@ "pd_op.bitwise_xor_", # Array ops "pd_op.assign_array", - "pd_op.array_length", - "pd_op.slice_array", - "pd_op.slice_array_dense", - "pd_op.assign_array", "pd_op.assign_array_", - "pd_op.create_array", - "pd_op.create_array_like", + "pd_op.array_length", + "pd_op.array_pop", "pd_op.array_read", "pd_op.array_write_", - "pd_op.array_pop", + "pd_op.create_array", + "pd_op.create_array_like", + "pd_op.slice_array", + "pd_op.slice_array_dense", # Others - "pd_op.remainder", - "pd_op.argmax", - "pd_op.print", "pd_op.accuracy", - "pd_op.randint", - "pd_op.uniform", - "pd_op.gaussian", + "pd_op.all", + "pd_op.any", + "pd_op.argmax", + "pd_op.assign_value_", "pd_op.bernoulli", + "pd_op.distribute_fpn_proposals", + "pd_op.floor_divide", "pd_op.full_like", - "pd_op.assign_value_", - "pd_op.nextafter", + "pd_op.full_with_tensor", + "pd_op.gaussian", "pd_op.isnan", "pd_op.isinf", - "pd_op.all", - "pd_op.any", + "pd_op.nextafter", + "pd_op.nonzero", + "pd_op.one_hot", + "pd_op.print", "pd_op.prior_box", + "pd_op.randint", + "pd_op.remainder", + "pd_op.shape", "pd_op.share_data_", - "pd_op.floor_divide", + "pd_op.uniform", ] diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index 4b00161bc3c82..f412a954c0bb0 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -506,7 +506,11 @@ def _test_use_sync(value): # ops in forward_blacklist will not be replaced by composite ops. -prim_config = {"forward_blacklist": set(), "composite_ops_record": set()} +prim_config = { + "forward_blacklist": set(), + "composite_ops_record": set(), + "backward_blacklist": set(), +} def _get_batch_norm_none_var(op): @@ -588,6 +592,7 @@ def _reset_prim_forward_blacklist(): def _set_prim_backward_blacklist(*args): ops = set(args) for item in ops: + prim_config["backward_blacklist"].add(item) if not isinstance(item, str): raise TypeError("all items in set must belong to string") _set_bwd_prim_blacklist(ops) @@ -671,3 +676,15 @@ def _check_and_set_prim_vjp_skip_default_ops(): _check_and_set_prim_vjp_skip_default_ops() + + +def _check_prim_vjp_ops(): + ops_org = os.getenv("FLAGS_prim_backward_blacklist", "") + if ops_org: + ops = [] + for item in ops_org.split(";"): + ops.append(item.strip()) + _set_prim_backward_blacklist(*ops) + + +_check_prim_vjp_ops() diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index fcd69d0fd65d1..4b62b57f4e806 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -33,7 +33,6 @@ import numpy as np import paddle -import paddle.version as paddle_version from .. import pir from . 
import core, unique_name @@ -573,10 +572,10 @@ def require_version(min_version, max_version=None): ) version_installed = [ - paddle_version.major, - paddle_version.minor, - paddle_version.patch, - paddle_version.rc, + paddle.version.major, + paddle.version.minor, + paddle.version.patch, + paddle.version.rc, ] zero_version = ["0", "0", "0", "0"] @@ -591,13 +590,13 @@ def version_cmp(ver_a, ver_b): if version_cmp(version_installed, zero_version) == 0: if max_version is not None: warnings.warn( - f"PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle_version.full_version} installed. " + f"PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle.version.full_version} installed. " "Maybe you are using a develop version, " "please make sure the version is good with your code." ) else: warnings.warn( - f"PaddlePaddle version {min_version} or higher is required, but {paddle_version.full_version} installed, " + f"PaddlePaddle version {min_version} or higher is required, but {paddle.version.full_version} installed, " "Maybe you are using a develop version, " "please make sure the version is good with your code." ) @@ -619,12 +618,12 @@ def version_cmp(ver_a, ver_b): or version_cmp(version_installed, min_version_to_check) < 0 ): raise Exception( - f"VersionError: PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle_version.full_version} installed." + f"VersionError: PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle.version.full_version} installed." ) else: if version_cmp(version_installed, min_version_to_check) < 0: raise Exception( - f"VersionError: PaddlePaddle version {min_version} or higher is required, but {paddle_version.full_version} installed, " + f"VersionError: PaddlePaddle version {min_version} or higher is required, but {paddle.version.full_version} installed, " f"please upgrade your PaddlePaddle to {min_version} or other higher version." 
) @@ -1617,6 +1616,9 @@ def __init__( if name is None: name = self.block.program._name_generator("_generated_var") + while self.block._find_var_recursive(name) is not None: + name = self.block.program._name_generator("_generated_var") + if dtype is not None: dtype = convert_to_proto_type(dtype) diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py index 6ffaebe444c9d..ab06767768271 100644 --- a/python/paddle/decomposition/decomp.py +++ b/python/paddle/decomposition/decomp.py @@ -850,13 +850,15 @@ def decompose_dist_program(pir_program): decompose(pir_program, []) # decomp backward ops + blacklist = core.prim_config["backward_blacklist"] + block = pir_program.global_block() + pre_combine_op = None with paddle.pir.core.program_guard(pir_program): ops = pir_program.global_block().ops for op in ops: bwd_op_name = op.name() - # todo(CZ): to be removed - if bwd_op_name in ["pd_op.mean_grad", "pd_op.concat_grad"]: + if bwd_op_name.split(".")[-1] in blacklist: continue skip_decomp = False if has_decomp_vjp(op): @@ -867,13 +869,45 @@ def decompose_dist_program(pir_program): if not skip_decomp: pir.set_insertion_point(op) orig_outs = op.results() + + is_next_split = False decomp_outs = call_decomp_vjp(op) - new_outs = _analyse_decomp_results( - orig_outs, decomp_outs, op - ) - op.replace_all_uses_with(new_outs) + for i in range(len(orig_outs)): + if orig_outs[i].has_one_use(): + next_op = orig_outs[i].first_use().owner() + if next_op.name() == "builtin.split": + is_next_split = True + _check_op_results( + next_op.name(), + next_op.results(), + decomp_outs[i], + ) + next_op.replace_all_uses_with(decomp_outs[i]) + block.remove_op(next_op) + + if not is_next_split: + new_outs = _analyse_decomp_results( + orig_outs, decomp_outs, op + ) + _check_op_results(op.name(), orig_outs, new_outs) + op.replace_all_uses_with(new_outs) + block.remove_op(op) + if op.name() == "builtin.combine": + pre_combine_op = op + + if pre_combine_op is not None: + remove_op = True + for item in pre_combine_op.results(): + if item.has_one_use(): + remove_op = False + break + if remove_op: + block.remove_op(pre_combine_op) + pre_combine_op = None + paddle.pir.set_insertion_point_to_block_end(block) + def decompose_pir_program(pir_program, param_mapping, grad_var_to_var): ''' diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 7c2439a059a34..7faa92607719c 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -58,6 +58,7 @@ from .pir_pass import ( apply_partition_pass, apply_reshard_pass, + remove_other_rank_op_pass, remove_unuseful_comm_op_pass, ) from .planner_v2 import Planner @@ -696,6 +697,8 @@ def _parallel_pir(self, mode): # collect the communicator created during resolution. apply_reshard_pass(dist_program) + remove_other_rank_op_pass(dist_program) + # Part 4: Optimization Pass # NOTE Only those Optimization Pass that related to Parallelism (need dist attr) should be placed here and all the Pass should be Optional. 
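In `_parallel_pir` above, pruning now runs as its own `remove_other_rank_op_pass` after `apply_reshard_pass`, so communication ops already exist by the time any rank-local deletion happens. A toy model of the pruning step (illustrative types only, not Paddle's real IR):

```python
from dataclasses import dataclass, field

@dataclass
class ToyOp:
    name: str
    mesh: set  # ranks that would execute this op

@dataclass
class ToyProgram:
    ops: list = field(default_factory=list)

def remove_other_rank_op_pass(program: ToyProgram, cur_rank: int) -> None:
    # the real pass walks ops in reverse so result uses vanish before producers
    program.ops = [op for op in program.ops if cur_rank in op.mesh]

prog = ToyProgram([ToyOp("matmul", {0, 1}), ToyOp("send_0_to_1", {0}), ToyOp("recv_on_1", {1})])
remove_other_rank_op_pass(prog, cur_rank=1)
print([op.name for op in prog.ops])  # ['matmul', 'recv_on_1']
```

Running it after resharding is what makes the deletion safe: by then every surviving value on a rank is produced by an op that rank actually owns.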
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py index 130e80212f274..f5df914650c2c 100644 --- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py @@ -478,9 +478,12 @@ def _apply_post_optimization( self._strategy.gradient_merge.avg = True # gradient_merge is then train-only optimization + grad_to_global_grad = {} if self.is_train and self._strategy.gradient_merge.enable: config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) config["dist_context"] = self._dist_context + config["grad_to_global_grad"] = grad_to_global_grad + config["pipeline_mode"] = self._strategy.pipeline.schedule_mode if gradient_sync_after_accumulate: config["params_grads"] = global_params_grads config[ @@ -557,4 +560,5 @@ "vpp_degree": self._strategy.pipeline.vpp_degree, "dist_context": self._dist_context, "split_backward": self._strategy.pipeline.split_backward, + "grad_to_global_grad": grad_to_global_grad, } diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index cae150f556967..6597aebb2f9de 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -89,44 +89,6 @@ def apply_partition_pass(program): var.replace_all_uses_with(reshard_var) reshard_var.get_defining_op().operand(0).set_source(var) - # pruning op and value not belong to cur rank - cur_rank = paddle.distributed.get_rank() - for op in program.global_block().ops[::-1]: - if op.name() in partition_skip_op_list: - can_delete = True - for val in op.results(): - if not val.use_empty(): - can_delete = False - if can_delete: - op.erase() - continue - if cur_rank not in op.dist_attr.process_mesh.process_ids: - op.erase() - else: - # set the operand as null when it is not belong to cur rank - if ( - op.name() == 'dist_op.reshard' - and cur_rank - not in op.operand(0) - .source() - .dist_attr() - .process_mesh.process_ids - ): - op.operand(0).set_source(None) - - # merge pd.data ops for - lr_ops = [] - for op in program.global_block().ops[::-1]: - if op.name() == 'pd_op.data' and "learning_rate" in op.attrs()["name"]: - lr_ops.append(op) - - if len(lr_ops) > 1: - lr_value = lr_ops[0].result(0) - for op in lr_ops[1:]: - lr = op.result(0) - lr.replace_all_uses_with(lr_value) - op.erase() - def apply_reshard_pass(program): for op in program.global_block().ops: @@ -160,6 +122,40 @@ def apply_reshard_pass(program): op.erase() +
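Earlier in this hunk, `_apply_post_optimization` allocates an empty `grad_to_global_grad` dict and threads it into both the gradient-merge config and the pipeline pass attributes: one pass fills the mapping in place, a later pass reads it. A self-contained sketch of that caller-owned out-parameter pattern (all names below are illustrative):

```python
# Caller-owned mapping threaded through pass configs: gradient merge fills
# it, the pipeline pass reads it later. Names here are hypothetical.
def gradient_merge_pass(config: dict) -> None:
    config["grad_to_global_grad"]["w@GRAD"] = "w@GRAD@MERGED"  # hypothetical names

def pipeline_pass(config: dict) -> None:
    for grad, global_grad in config["grad_to_global_grad"].items():
        print(f"rewire {grad} -> {global_grad}")

shared = {}
gradient_merge_pass({"grad_to_global_grad": shared})
pipeline_pass({"grad_to_global_grad": shared})
```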
+# prune ops and values that do not belong to the current rank +def remove_other_rank_op_pass(dist_program): + cur_rank = paddle.distributed.get_rank() + for op in dist_program.global_block().ops[::-1]: + if op.name() in partition_skip_op_list: + can_delete = True + for val in op.results(): + if not val.use_empty(): + can_delete = False + if can_delete: + op.erase() + continue + if cur_rank not in op.dist_attr.process_mesh.process_ids: + op.erase() + elif op.name() == "dist_op.reshard": + assert op.result( + 0 + ).use_empty(), f'There should be no useful dist.reshard op left in remove_other_rank_op_pass, but found: {op}' + op.erase() + + # merge pd.data ops for learning_rate + lr_ops = [] + for op in dist_program.global_block().ops[::-1]: + if op.name() == 'pd_op.data' and "learning_rate" in op.attrs()["name"]: + lr_ops.append(op) + + if len(lr_ops) > 1: + lr_value = lr_ops[0].result(0) + for op in lr_ops[1:]: + lr = op.result(0) + lr.replace_all_uses_with(lr_value) + op.erase() + + # Note: this is the pass in the dense program comm_ops = ["pd_op.c_allreduce_sum_", "pd_op.c_allgather"] diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py index cf4b9b7b32af1..bbc9b959b72db 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import paddle import paddle.distributed as dist @@ -67,10 +66,12 @@ def get_1D_sub_process_mesh(process_mesh, mesh_dim): process_ids = np.array(process_mesh.process_ids).reshape(mesh_shape) rank_id = dist.get_rank() + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh.process_ids: + rank_id = process_mesh.process_ids[0] coord = list(np.where(process_ids == rank_id)) coord[mesh_dim] = range(mesh_shape[mesh_dim]) sub_process_ids = process_ids[tuple(coord)].flatten() - sub_mesh_shape = sub_process_ids.shape sub_mesh_name = dim_names[mesh_dim] return dist.ProcessMesh(sub_process_ids, [sub_mesh_name]) @@ -106,35 +107,31 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): first_diff_axis = find_first_diff_shard_axis( src_dist_attr, dst_dist_attr ) - ori_dst_dist_attr = copy_dist_attr_with_new_member(dst_dist_attr) - out_value = src_value # intermediate result - src_type = src_value.type() + # out_value = src_value # intermediate result + # src_type = src_value.type() tensor_ndim = len(src_value.shape) process_mesh = dst_dist_attr.process_mesh # Step2. Convert the non-replicated dimensions to replicated. # Step2.1.
convert partial status to replicated - real_out_dist_attr = copy_dist_attr_with_new_member(src_dist_attr) if is_partial(src_dist_attr): - in_partial_status = copy.deepcopy(src_dist_attr.partial_status) + in_partial_status = src_dist_attr.partial_status out_partial_status = dst_dist_attr.partial_status # read-only # convert each partial dim to replicated with corresponding # 1-D mesh function for partial_dim, partial_type in in_partial_status.items(): - if ( - partial_dim in out_partial_status - or partial_dim in ori_dst_dist_attr.dims_mapping - ): + if partial_dim in out_partial_status: continue # get the partial status after converting - real_out_partial_status = copy.deepcopy( - real_out_dist_attr.partial_status + tmp_partial_status = src_dist_attr.partial_status + tmp_partial_status.pop(partial_dim) + tmp_dst_dist_attr = copy_dist_attr_with_new_member( + src_dist_attr, + new_partial_status=tmp_partial_status, ) - real_out_partial_status.pop(partial_dim) - real_out_dist_attr = copy_dist_attr_with_new_member( - real_out_dist_attr, - new_partial_status=real_out_partial_status, + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dst_dist_attr ) # get the process_mesh on specific axis @@ -160,28 +157,29 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): ) one_dim_func = PToRReshardFunction() - out_value = one_dim_func.reshard( + src_value = one_dim_func.reshard( in_one_dim_dist_attr, out_one_dim_dist_attr, - out_value, - src_type, + src_value, + tmp_dst_type, ) - - out_value.update_dist_attr(real_out_dist_attr) + src_dist_attr = tmp_dst_dist_attr # Step2.2 convert shard status to replicated for i in range(first_diff_axis, -1, -1): - in_mesh_axis = real_out_dist_attr.dims_mapping[i] - if in_mesh_axis == -1: + in_mesh_axis = src_dist_attr.dims_mapping[i] + out_mesh_axis = dst_dist_attr.dims_mapping[i] + if in_mesh_axis == -1 or in_mesh_axis == out_mesh_axis: continue # calculate the dist_attr after converting - real_out_dims_mapping = copy.deepcopy( - real_out_dist_attr.dims_mapping + tmp_dims_mapping = src_dist_attr.dims_mapping + tmp_dims_mapping[i] = -1 + tmp_dst_dist_attr = copy_dist_attr_with_new_member( + src_dist_attr, new_dims_mapping=tmp_dims_mapping ) - real_out_dims_mapping[i] = -1 - real_out_dist_attr = copy_dist_attr_with_new_member( - real_out_dist_attr, new_dims_mapping=real_out_dims_mapping + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dst_dist_attr ) # get the process_mesh on specific axis @@ -205,45 +203,41 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): ) one_dim_func = SToRReshardFunction() - out_value = one_dim_func.reshard( - in_one_dim_dist_attr, out_one_dim_dist_attr, out_value, src_type + src_value = one_dim_func.reshard( + in_one_dim_dist_attr, + out_one_dim_dist_attr, + src_value, + tmp_dst_type, ) - - out_value.update_dist_attr(real_out_dist_attr) + src_dist_attr = tmp_dst_dist_attr # Step3. 
Convert the replicated status to the status in dst_dist_attr # Step3.1 convert replicated to partial - if is_partial(ori_dst_dist_attr): - in_partial_status = out_value.dist_attr.partial_status - out_partial_status = ori_dst_dist_attr.partial_status + if is_partial(dst_dist_attr): + in_partial_status = src_dist_attr.partial_status + out_partial_status = dst_dist_attr.partial_status for partial_dim, partial_type in out_partial_status.items(): if partial_dim in in_partial_status: continue - raise NotImplementedError( "RToPReshardFunction is not implemented" ) - # Step3.2 convert replicated/partial to shard + # Step3.2 convert replicated to shard for i in range(first_diff_axis, -1, -1): - out_mesh_axis = ori_dst_dist_attr.dims_mapping[i] - if out_mesh_axis == -1: + in_mesh_axis = src_dist_attr.dims_mapping[i] + out_mesh_axis = dst_dist_attr.dims_mapping[i] + if in_mesh_axis == out_mesh_axis: continue - in_partial_status = out_value.dist_attr().partial_status - need_p2s = out_mesh_axis in in_partial_status - dims_mapping = copy.deepcopy(real_out_dist_attr.dims_mapping) - dims_mapping[i] = out_mesh_axis - partial_status = None - if out_mesh_axis in real_out_dist_attr.partial_status: - partial_status = copy.deepcopy( - real_out_dist_attr.partial_status - ) - partial_status.pop(out_mesh_axis) - real_out_dist_attr = copy_dist_attr_with_new_member( - real_out_dist_attr, - new_dims_mapping=dims_mapping, - new_partial_status=partial_status, + # calculate the dist_attr after converting + tmp_dims_mapping = src_dist_attr.dims_mapping + tmp_dims_mapping[i] = out_mesh_axis + tmp_dst_dist_attr = copy_dist_attr_with_new_member( + src_dist_attr, new_dims_mapping=tmp_dims_mapping + ) + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dst_dist_attr ) # get the process_mesh on specific axis @@ -265,23 +259,15 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): sub_mesh, out_one_dim_dims_mapping, {} ) ) - - if need_p2s: - raise NotImplementedError( - "PToSReshardFunction is not implemented" - ) - else: - one_dim_func = RToSReshardFunction() - out_value = one_dim_func.reshard( - in_one_dim_dist_attr, - out_one_dim_dist_attr, - out_value, - dst_type, - ) - out_value.update_dist_attr(real_out_dist_attr) - - out_value.set_type(dst_type) - return out_value + one_dim_func = RToSReshardFunction() + src_value = one_dim_func.reshard( + in_one_dim_dist_attr, + out_one_dim_dist_attr, + src_value, + tmp_dst_type, + ) + src_dist_attr = tmp_dst_dist_attr + return src_value class NdMeshReshardFunctionCrossMesh(ReshardFunction): @@ -310,20 +296,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( src_value.type(), tmp_dist_attr ) - out_value = same_status_func.reshard( + src_value = same_status_func.reshard( src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type ) - if out_value is None: - return None - - curr_global_rank = paddle.distributed.get_rank() - if curr_global_rank in dst_dist_attr.process_mesh.process_ids: - nd_mesh_func = NdMeshReshardFunction() - assert nd_mesh_func.is_suitable( - tmp_dist_attr, dst_dist_attr - ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" - return nd_mesh_func.reshard( - tmp_dist_attr, dst_dist_attr, out_value, dst_type - ) - return None + nd_mesh_func = NdMeshReshardFunction() + assert nd_mesh_func.is_suitable( + tmp_dist_attr, dst_dist_attr + ), f"Invoking the nd mesh reshard function from {tmp_dist_attr} to {dst_dist_attr} is not valid" + return nd_mesh_func.reshard( + tmp_dist_attr, dst_dist_attr, src_value, dst_type + )
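The rewritten `NdMeshReshardFunction.reshard` above normalizes axis by axis: clear partial dims, unshard mismatched dims, then shard toward the target, each step being a 1-D reshard on a sub-mesh with `tmp_dst_dist_attr`/`tmp_dst_type` re-derived per hop. A runnable toy that plans the shard/unshard hops (partial handling omitted; my own simplification, not Paddle code):

```python
# Reduced model of the steps above: fix one mesh axis at a time,
# first unsharding mismatches (s_to_r), then sharding toward the target (r_to_s).
def nd_mesh_reshard_plan(src_mapping: list, dst_mapping: list) -> list:
    plan, cur = [], list(src_mapping)
    for i, (s, d) in enumerate(zip(cur, dst_mapping)):   # Step 2.2: shard -> replicated
        if s != -1 and s != d:
            plan.append(f"s_to_r on tensor dim {i} (mesh axis {s})")
            cur[i] = -1
    for i, (c, d) in enumerate(zip(cur, dst_mapping)):   # Step 3.2: replicated -> shard
        if c != d:
            plan.append(f"r_to_s on tensor dim {i} (mesh axis {d})")
            cur[i] = d
    return plan

print(nd_mesh_reshard_plan([0, -1], [-1, 1]))
# ['s_to_r on tensor dim 0 (mesh axis 0)', 'r_to_s on tensor dim 1 (mesh axis 1)']
```

Threading `(src_value, src_dist_attr)` forward after each hop, as the patch now does, avoids the separate `out_value`/`real_out_dist_attr` bookkeeping that the old code had to keep in sync by hand.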
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py index 8956cc2535d9b..d5046ff0f7963 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py @@ -47,7 +47,7 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): src_reduce_type = ReduceOp.SUM reduce_mean = True - group = new_process_group(src_mesh.process_ids) + group = new_process_group(sorted(src_mesh.process_ids)) reduced_value = paddle._C_ops.c_allreduce_sum_( src_value, group.id, True, False ) @@ -95,20 +95,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( src_value.type(), tmp_dist_attr ) - out_value = same_status_func.reshard( + src_value = same_status_func.reshard( src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type ) - if out_value is None: - return None - - curr_global_rank = paddle.distributed.get_rank() - if curr_global_rank in dst_dist_attr.process_mesh.process_ids: - p_to_r_func = PToRReshardFunction() - assert p_to_r_func.is_suitable( - tmp_dist_attr, dst_dist_attr - ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" - return p_to_r_func.reshard( - tmp_dist_attr, dst_dist_attr, out_value, dst_type - ) - return None + p_to_r_func = PToRReshardFunction() + assert p_to_r_func.is_suitable( + tmp_dist_attr, dst_dist_attr + ), f"Invoking the p to r reshard function from {tmp_dist_attr} to {dst_dist_attr} is not valid" + return p_to_r_func.reshard( + tmp_dist_attr, dst_dist_attr, src_value, dst_type + ) diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py index 922df440c5a21..3b54fa4d8a728 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py @@ -59,15 +59,17 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): out_value = paddle.slice(src_value, [split_axis], [start], [end]) - out_value.set_type(src_value.type()) - out_value.update_dist_attr(dst_dist_attr) + out_value.set_type(dst_type) out_value.get_defining_op().dist_attr = ( paddle.base.libpaddle.pir.create_op_dist_attribute( mesh, [src_dist_attr], [dst_dist_attr] ) ) return out_value - return None
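Just below, the old `return None` escape for ranks outside the destination mesh is replaced by a typed `reshard_v2` placeholder: callers no longer branch on `None`, and use-def chains stay intact until `remove_other_rank_op_pass` erases the fake op. The gist, as a self-contained toy (dicts stand in for IR values; not real Paddle IR):

```python
# Toy rendering of the idea: every rank gets a value of the right type;
# only participants get real compute behind it.
def reshard_value(value: dict, dst_type: str, participates: bool) -> dict:
    if participates:
        return {"op": "slice", "type": dst_type, "src": value}
    # ~ the reshard_v2 fake var: correct type, no real computation behind it
    return {"op": "placeholder_reshard", "type": dst_type, "src": value}

out = reshard_value({"op": "data", "type": "f32[8]"}, "f32[4]", participates=False)
assert out["type"] == "f32[4]"  # callers can rely on the type either way
```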
+ fake_var = paddle._C_ops.reshard_v2(src_value, dst_dist_attr) + fake_var.set_type(dst_type) + return fake_var class RToSReshardFunctionCrossMesh(ReshardFunction):
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py index 5a907839cf78b..42d92392b65c9 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py @@ -102,8 +102,7 @@ def get_split_axis_with_dims_mapping(dims_mapping): return new_value else: # TODO(ywt01) support unbalanced split - pass - return None + raise NotImplementedError("unbalanced split is not implemented") def reshard_s_to_r_with_padding( self, ): src_mesh = src_dist_attr.process_mesh num_of_process = len(src_mesh.process_ids) - dtype = src_value.dtype - group = new_process_group(src_mesh.process_ids) + + group = new_process_group(sorted(src_mesh.process_ids)) allgather_value = paddle._C_ops.c_allgather( src_value, group.id, num_of_process, True ) @@ -138,11 +137,32 @@ def reshard_s_to_r_with_padding( if split_axis != 0 or padding_num != 0: allgather_op = allgather_value.get_defining_op() - paddle.pir.set_insertion_point_after(allgather_op) - split_value = paddle._C_ops.split_with_num( + split_values = paddle._C_ops.split_with_num( allgather_op.result(0), num_of_process, 0 ) - concat_value = paddle._C_ops.concat(split_value, split_axis) + builtin_split_op = split_values[0].get_defining_op() + pd_split_op = builtin_split_op.operand_source(0).get_defining_op() + + # fix the split_with_num dist attribute. + new_inner_types = [] + for sub_value in split_values: + new_inner_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + sub_value.type(), allgather_value.dist_attr() + ) + new_inner_types.append(new_inner_type) + sub_value.set_type(new_inner_type) + vec_type = paddle.base.libpaddle.pir.create_vec_type( + new_inner_types + ) + pd_split_op.result(0).set_type(vec_type) + + concat_value = paddle._C_ops.concat(split_values, split_axis) + # fold builtin.split op and builtin.combine op + concat_op = concat_value.get_defining_op() + builtin_combine_op = concat_op.operand_source(0).get_defining_op() + concat_op.operand(0).set_source(pd_split_op.result(0)) + builtin_combine_op.erase() + builtin_split_op.erase() return concat_value return allgather_value @@ -183,16 +203,11 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): out_value = same_status_func.reshard( src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type ) - if out_value is None: - return None - - curr_global_rank = paddle.distributed.get_rank() - if curr_global_rank in dst_dist_attr.process_mesh.process_ids: - s_to_r_func = SToRReshardFunction() - assert s_to_r_func.is_suitable( - tmp_dist_attr, dst_dist_attr - ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" - return s_to_r_func.reshard( - tmp_dist_attr, dst_dist_attr, out_value, dst_type - ) - return None + + s_to_r_func = SToRReshardFunction() + assert s_to_r_func.is_suitable( + tmp_dist_attr, dst_dist_attr + ), f"Invoking the s to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" + return s_to_r_func.reshard( + tmp_dist_attr, dst_dist_attr, out_value, dst_type + )
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py
b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py index ceae2e7424fd6..db6ec8d1df238 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py @@ -87,11 +87,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): dst_mesh, [], [dst_dist_attr] ) ) - recv_value.update_dist_attr(dst_dist_attr) + recv_value.set_type(dst_type) is_send = False break if is_send: - return None + # fake var will be removed in remove_other_rank_op_pass. + fake_var = paddle._C_ops.reshard_v2(src_value, dst_dist_attr) + fake_var.set_type(dst_type) + return fake_var else: return recv_value diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py index 741120f7fe598..2db4cb6e0bdcc 100644 --- a/python/paddle/distributed/auto_tuner/utils.py +++ b/python/paddle/distributed/auto_tuner/utils.py @@ -1068,7 +1068,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): prefix + str(cfg[arg]) if prefix else cfg[arg] ) json.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + "_" @@ -1107,7 +1110,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): prefix + str(cfg[arg]) if prefix else cfg[arg] ) yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + cfg["log_dir_name"] @@ -1157,7 +1163,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): else: cmd_cfg[keys[-1]] = rr_values json.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + cfg["log_dir_name"] @@ -1198,7 +1207,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): else: cmd_cfg[keys[-1]] = rr_values yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + cfg["log_dir_name"] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 63f76416142c1..ba4c61a1f917a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -812,6 +812,7 @@ def copy_attr(attr_name): copy_attr("optimize_attr") copy_attr("do_model_average") copy_attr("need_clip") + copy_attr("no_sync") self._slice_params[param.name] = slice_param return slice_param diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 33b8c3d95d582..db8c2f7b9b820 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -24,6 +24,7 @@ paddle.float32: 
"float32", paddle.float64: "float64", paddle.bfloat16: "bfloat16", + paddle.bool: "bool", } PADDLE_TO_NUMBER = { @@ -33,6 +34,7 @@ paddle.int32: 3, paddle.int64: 4, paddle.bfloat16: 5, + paddle.bool: 6, } NUMBER_TO_DTYPE = { @@ -42,6 +44,7 @@ 3: "int32", 4: "int64", 5: "bfloat16", + 6: "bool", } diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 816af6f91530d..53d929c7890bd 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -451,7 +451,9 @@ def check_layer_sparse(sublayer): return False is_sparse_gradient = [ - check_layer_sparse(sublayer) for sublayer, _ in layers_param + check_layer_sparse(sublayer) + for sublayer, param in layers_param + if not getattr(param, "no_sync", False) ] if in_dynamic_mode(): diff --git a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py index 77affd4cd9c1e..e22cc5bbf6d65 100644 --- a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py +++ b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py @@ -138,8 +138,14 @@ def _split_matmul_grad_and_multi_streaming_allreduce( name: allreduce_op.output(name) for name in allreduce_op_outputs } + # matmul_v2 + reshape + reshape + matmul_v2 + reshape + ... + original c_allreduce_sum + # => + # matmul_v2 + new c_allreduce_sum + reshape + reshape + matmul_v2 + reshape + ... + original c_allreduce_sum + # + # NOTE(liym27): new c_allreduce_sum must be inserted to "the next of the first matmul_v2", otherwise another + # pass fused_linear_param_grad_add will not work. allreduce_op = block._insert_op_without_sync( - index=allreduce_id + 1, + index=matmul_grad_id + 1, type=allreduce_op.type, inputs=allreduce_op_inputs, outputs=allreduce_op_outputs, diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index aab9bdb2456a0..2d7413965ae3b 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -523,6 +523,8 @@ def parse_program( dist_context, ) + return grad_to_gradient_merge + @register_pass("auto_parallel_gradient_merge_pass") class GradientMergePass(PassBase): @@ -550,8 +552,9 @@ def _apply_single_impl(self, main_program, startup_program, context): gradient_sync_after_accumulate = self.get_attr( "gradient_sync_after_accumulate", False ) + grad_to_global_grad = self.get_attr("grad_to_global_grad", {}) with paddle.static.program_guard(main_program, startup_program): - parse_program( + grad_to_merge_grad = parse_program( main_program, startup_program, params_grads, @@ -562,3 +565,5 @@ def _apply_single_impl(self, main_program, startup_program, context): ) main_program._sync_with_cpp() + for k, v in grad_to_merge_grad.items(): + grad_to_global_grad[k] = v diff --git a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py index 4fc9a1ec28692..8bc29411269ab 100644 --- a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py +++ b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py @@ -15,6 +15,10 @@ import logging from paddle.base import core +from paddle.distributed.auto_parallel.static.operators.common import ( + is_data_parallel_reduce_op, + is_data_parallel_scale_op, +) from ...utils.log_utils import 
get_logger from ..pass_base import register_pass @@ -36,7 +40,8 @@ class PipelineVirtualPipelinePass(PipelinePassBase): def __init__(self): super().__init__() - + self._real_overlap_sharding_reduce = False + self.reduce_comm_suffix = "_reduce" self._forward_micro_step_counter = {} self._backward_micro_step_counter = {} @@ -137,10 +142,22 @@ def _get_virtual_pp_rank(micro_step, forward): if real_split_backward: for chunk_id in range(num_model_chunks - 1, -1, -1): for micro_batch_id in range(0, accumulate_steps): - w_job = core.Job(BACKWARD + "_w" + str(chunk_id)) + if ( + self._real_overlap_sharding_reduce + and micro_batch_id == accumulate_steps - 1 + ): + w_job = core.Job( + BACKWARD + + "_w" + + str(chunk_id) + + self.reduce_comm_suffix + ) + else: + w_job = core.Job(BACKWARD + "_w" + str(chunk_id)) w_job.set_micro_batch_id(micro_batch_id) job_list.append(w_job) - + job_types = [job.type() for job in job_list] + logger.debug(f"The VPP job list: {job_types}") opt_job = core.Job(OPT) job_list.append(opt_job) return job_list @@ -162,6 +179,102 @@ def _split_matmul_grad_ops_to_matmul(self, program, dist_context): block, matmul_grad_id, dist_context=dist_context ) + def _move_sharding_comm_to_backward( + self, types, sub_programs, global_grads + ): + def _get_sharding_comm_op(op, idx, ops): + if is_data_parallel_reduce_op(op): + op_input_names = op.desc.input_arg_names() + op_output_names = op.desc.output_arg_names() + if ( + op_input_names[0] == op_output_names[0] + and op_input_names[0] in global_grads + ): + global_grad_to_comm_op[op_input_names[0]] = [op] + remove_op_ids.append(idx) + + if op.type in ["c_allreduce_sum", "c_reduce_sum"]: + scale_index = idx + 1 + if scale_index < len(ops): + if is_data_parallel_scale_op(ops[scale_index]): + global_grad_to_comm_op[op_input_names[0]].append(op) + remove_op_ids.append(scale_index) + + def _get_scale_op(op, idx): + if is_data_parallel_scale_op(op): + return + if op.type == 'scale': + op_input_names = op.desc.input_arg_names() + op_output_names = op.desc.output_arg_names() + if ( + op_input_names[0] == op_output_names[0] + and op_input_names[0] in global_grads + ): + global_grad_to_scale_op[op_input_names[0]] = op + remove_op_ids.append(idx) + + # 1. get all the sharding_avg ops in the optimizer program + type_programs = dict(zip(types, sub_programs)) + opt_program = type_programs["optimizer"] + global_grad_to_comm_op = {} + global_grad_to_scale_op = {} + all_remove_op_ids = [] + for cur_block in opt_program.blocks: + remove_op_ids = [] + for idx, op in enumerate(cur_block.ops): + _get_scale_op(op, idx) + _get_sharding_comm_op(op, idx, cur_block.ops) + all_remove_op_ids.append(remove_op_ids) + if len(global_grad_to_comm_op) == 0: # no need to overlap sharding comm + return False + + # 2. create the new backward_w programs with the sharding comm + new_types = [] + new_programs = [] + for type, sub_program in type_programs.items(): + if "backward_w" in type: + new_program = sub_program.clone() + cur_block = new_program.global_block() + cur_block_scale_op = [] + for idx, op in reversed(list(enumerate(cur_block.ops))): + if op.type == "elementwise_add": + input_arg_names = op.input_arg_names + output_arg_names = op.output_arg_names + if ( + input_arg_names[0] == output_arg_names[0] + and input_arg_names[0] in global_grad_to_comm_op + ): + for origin_op in reversed( + global_grad_to_comm_op[input_arg_names[0]] + ): + new_op = cur_block._insert_op_without_sync( + index=idx + 1, type="nop" + ) + new_op.desc.copy_from(origin_op.desc) + del
global_grad_to_comm_op[input_arg_names[0]] + cur_block_scale_op.append( + global_grad_to_scale_op[input_arg_names[0]] + ) + for origin_op in cur_block_scale_op: + new_op = cur_block.append_op(type="nop") + new_op.desc.copy_from(origin_op.desc) + cur_block._sync_with_cpp() + new_types.append(type + self.reduce_comm_suffix) + new_programs.append(new_program) + assert ( + len(global_grad_to_comm_op) == 0 + ), f"global_grad_to_comm_op must be used up, but left: {global_grad_to_comm_op}" + + types.extend(new_types) + sub_programs.extend(new_programs) + + for id, cur_block in enumerate(opt_program.blocks): + for op_id in reversed(all_remove_op_ids[id]): + cur_block._remove_op(op_id) + cur_block._sync_with_cpp() + + return True + def _partial_programs(self, program): dist_context = self.get_attr("dist_context") num_model_chunks = self.get_attr("vpp_degree") @@ -169,7 +282,10 @@ def _partial_programs(self, program): accumulate_steps = self.get_attr("num_micro_batches") num_stages = self.get_attr("pp_degree") split_backward = self.get_attr("split_backward", False) - + grad_to_global_grad = self.get_attr("grad_to_global_grad", {}) + global_grads = [ + global_grad for _, global_grad in grad_to_global_grad.items() + ] if split_backward and accumulate_steps == num_stages: self._split_matmul_grad_ops_to_matmul(program, dist_context) types, sub_program_list = _program_for_vpp_split_bwk( @@ -178,6 +294,11 @@ def _partial_programs(self, program): dist_context, enable_send_recv_overlap, ) + self._real_overlap_sharding_reduce = ( + self._move_sharding_comm_to_backward( + types, sub_program_list, global_grads + ) + ) else: types, sub_program_list = _program_for_vpp( program, diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py index 246c4ffb71173..168fbc460d5bd 100644 --- a/python/paddle/distribution/__init__.py +++ b/python/paddle/distribution/__init__.py @@ -34,6 +34,7 @@ from .multivariate_normal import MultivariateNormal from .normal import Normal from .poisson import Poisson +from .student_t import StudentT from .transform import ( # noqa:F401 AbsTransform, AffineTransform, @@ -77,6 +78,7 @@ 'Geometric', 'Binomial', 'Poisson', + 'StudentT', ] __all__.extend(transform.__all__) diff --git a/python/paddle/distribution/student_t.py b/python/paddle/distribution/student_t.py new file mode 100644 index 0000000000000..d1a88887023ff --- /dev/null +++ b/python/paddle/distribution/student_t.py @@ -0,0 +1,277 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections.abc import Sequence + +import paddle +from paddle.base.data_feeder import check_type, convert_dtype +from paddle.base.framework import Variable +from paddle.distribution import Gamma, distribution +from paddle.framework import in_dynamic_mode + + +class StudentT(distribution.Distribution): + r""" + The StudentT distribution with parameters: `df`, `loc`, `scale`. 
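Editorial note, not part of this patch: the sampler defined later in this class uses the classic representation T = Z / sqrt(V / nu) with Z ~ N(0, 1) and V ~ chi^2_nu, and chi^2_nu is exactly Gamma(concentration=nu/2, rate=1/2), which is why `_chi2` below is constructed as `Gamma(0.5 * df, 0.5)`. A NumPy sketch of the same construction:

import numpy as np

rng = np.random.default_rng(0)
df, n = 10.0, 100_000
z = rng.standard_normal(n)
v = rng.gamma(shape=0.5 * df, scale=2.0, size=n)  # rate 1/2 <=> scale 2, i.e. chi^2_df
t = z / np.sqrt(v / df)
print(t.var())  # close to the exact variance df / (df - 2) = 1.25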
+ + In probability theory and statistics, the StudentT distribution is one of the basic continuous probability distributions + defined on the real number set. + + The probability density function (pdf) is + + .. math:: + + pdf(x; \nu, \mu, \sigma) = \frac{\Gamma[(\nu+1)/2]}{\sigma\sqrt{\nu\pi}\Gamma(\nu/2)[1+(\frac{x-\mu}{\sigma})^2/\nu]^{(1+\nu)/2}} + + In the above equation: + + * :math:`df = \nu`: is the degree of freedom. + * :math:`loc = \mu`: is the center parameter. + * :math:`scale = \sigma`: is the scale parameter. + * :math:`\Gamma(\cdot)`: is the gamma function. + + Args: + df (float|Tensor): The degree of freedom of the distribution, which should be non-negative. If the input data type is float, + the data type of `df` will be converted to a 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64. + loc (float|Tensor): The center of the distribution. If the input data type is float, the data type of `loc` will be converted to a + 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64. + scale (float|Tensor): The scale of the distribution, which should be non-negative. If the input data type is float, the data type + of `scale` will be converted to a 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + + >>> import paddle + >>> from paddle.distribution import StudentT + >>> paddle.set_device('cpu') + >>> paddle.seed(100) + >>> dist = StudentT(df=10.0, loc=0.0, scale=1.0) + >>> dist.sample([3]) + Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-2.07709980], + [ 0.27981189], + [ 0.00881413]]) + + >>> dist2 = StudentT(df=paddle.to_tensor([10.0, 5.0]), loc=paddle.to_tensor([0.0, 0.0]), scale=paddle.to_tensor([1.0, 2.0])) + >>> value_tensor = paddle.to_tensor([0.8], dtype="float32") + >>> lp = dist2.log_prob(value_tensor) + >>> print(lp) + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1.28509235, -1.75626254]) + + >>> p = dist2.prob(value_tensor) + >>> print(p) + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.27662504, 0.17268908]) + + >>> entropy = dist2.entropy() + >>> print(entropy) + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + [1.52126312, 2.32064891]) + + """ + + def __init__(self, df, loc, scale, name=None): + if not in_dynamic_mode(): + check_type( + df, + 'df', + ( + float, + Variable, + paddle.pir.Value, + ), + 'StudentT', + ) + check_type( + loc, + 'loc', + ( + float, + Variable, + paddle.pir.Value, + ), + 'StudentT', + ) + check_type( + scale, + 'scale', + ( + float, + Variable, + paddle.pir.Value, + ), + 'StudentT', + ) + + self.name = name if name is not None else 'StudentT' + self.dtype = paddle.get_default_dtype() + + if self._validate_args(df, loc, scale): + self.df = df + self.loc = loc + self.scale = scale + self.df, self.loc, self.scale = paddle.broadcast_tensors( + [self.df, self.loc, self.scale] + ) + self.dtype = convert_dtype(df.dtype) + else: + self.df, self.loc, self.scale = self._to_tensor(df, loc, scale) + + if not self._check_nonnegative(self.df): + raise ValueError( + 'Every element of input parameter `df` should be nonnegative.' + ) + if not self._check_nonnegative(self.scale): + raise ValueError( + 'Every element of input parameter `scale` should be nonnegative.' 
+ ) + + if self.df.shape == []: + self.df = self.df.reshape([1]) + self.loc = self.loc.reshape([1]) + self.scale = self.scale.reshape([1]) + batch_shape = self.df.shape + super().__init__(batch_shape) + self._chi2 = Gamma(0.5 * self.df, paddle.full_like(self.df, 0.5)) + + def _check_nonnegative(self, value): + """Check the non-negative constraint for input parameters + + Args: + value (Tensor) + + Returns: + bool: pass or not. + """ + return (value >= 0.0).all() + + @property + def mean(self): + """Mean of StudentT distribution. + + Returns: + Tensor: mean value. + """ + return paddle.where( + self.df > 1.0, + self.loc, + paddle.full_like(self.loc, fill_value=float('nan')), + ) + + @property + def variance(self): + """Variance of StudentT distribution. + + Returns: + Tensor: variance value. + """ + var = self.df.clone().detach() + var_condition = self.df > 2.0 + var = paddle.where( + var_condition, + self.scale.pow(2) * var / (var - 2), + paddle.full_like(var, fill_value=float('nan')), + ) + inf_condition = (self.df <= 2.0).logical_and(self.df > 1.0) + var = paddle.where( + inf_condition, paddle.full_like(var, fill_value=float('inf')), var + ) + return var + + def sample(self, shape=()): + """Generate StudentT samples of the specified shape. The final shape would be ``shape+batch_shape`` . + + Args: + shape (Sequence[int], optional): Prepended shape of the generated samples. + + Returns: + Tensor: Sampled data with shape `sample_shape` + `batch_shape`. + """ + if not isinstance(shape, Sequence): + raise TypeError('sample shape must be Sequence object.') + + output_shape = self._extend_shape(shape) + z = paddle.cast(paddle.normal(shape=output_shape), self.dtype) + chi2 = self._chi2.sample(shape) + x = z * paddle.rsqrt(chi2 / self.df) + return self.loc + self.scale * x + + def entropy(self): + r"""Shannon entropy in nats. + + The entropy is + + .. math:: + + H = \log(\frac{\Gamma(\nu/2)\Gamma(1/2) \sigma \sqrt{\nu}}{\Gamma[(1+\nu)/2]}) + \frac{(1+\nu)}{2} \cdot \{\psi[(1+\nu)/2] - \psi(\nu/2)\} + + In the above equation: + + * :math:`\nu`: is the degree of freedom. + * :math:`\Gamma()`: is the gamma function. + * :math:`\psi()`: is the digamma function. + + Returns: + Tensor: Shannon entropy of StudentT distribution. The data type is the same as `df`. + """ + lbeta = ( + paddle.lgamma(0.5 * self.df) + + math.lgamma(0.5) + - paddle.lgamma(0.5 * (self.df + 1)) + ) + return ( + self.scale.log() + + 0.5 + * (self.df + 1) + * ( + paddle.digamma(0.5 * (self.df + 1)) + - paddle.digamma(0.5 * self.df) + ) + + 0.5 * self.df.log() + + lbeta + ) + + def log_prob(self, value): + """Log probability density function. + + Args: + value (Tensor): The input tensor. + + Returns: + Tensor: log probability density. The data type is the same as `df`. + """ + value = self._check_values_dtype_in_probs(self.df, value) + y = (value - self.loc) / self.scale + Z = ( + self.scale.log() + + 0.5 * self.df.log() + + 0.5 * math.log(math.pi) + + paddle.lgamma(0.5 * self.df) + - paddle.lgamma(0.5 * (self.df + 1.0)) + ) + return -0.5 * (self.df + 1.0) * paddle.log1p(y**2.0 / self.df) - Z + + def prob(self, value): + """Probability density function. + + Args: + value (Tensor): The input tensor. + + Returns: + Tensor: probability density. The data type is the same as `df`. 
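Editorial note, not part of this patch: `log_prob` above is the standard location-scale Student t log density. A quick numerical cross-check of the normalizer `Z`, assuming SciPy is available:

import math

import numpy as np
from scipy.stats import t as student_t

df, loc, scale, x = 10.0, 0.0, 2.0, 0.8
y = (x - loc) / scale
Z = (
    math.log(scale)
    + 0.5 * math.log(df)
    + 0.5 * math.log(math.pi)
    + math.lgamma(0.5 * df)
    - math.lgamma(0.5 * (df + 1.0))
)
lp = -0.5 * (df + 1.0) * math.log1p(y * y / df) - Z
assert np.isclose(lp, student_t.logpdf(x, df, loc=loc, scale=scale))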
+ """ + return paddle.exp(self.log_prob(value)) diff --git a/python/paddle/incubate/nn/functional/block_multihead_attention.py b/python/paddle/incubate/nn/functional/block_multihead_attention.py index a55f61de2c678..596b9581570ad 100644 --- a/python/paddle/incubate/nn/functional/block_multihead_attention.py +++ b/python/paddle/incubate/nn/functional/block_multihead_attention.py @@ -389,3 +389,156 @@ def block_multihead_attention( }, ) return out, qkv, key_cache, value_cache + + +def block_multihead_attention_xpu( + qkv, + key_cache, + value_cache, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + padding_offsets, + cum_offsets, + cu_seqlens_q, + cu_seqlens_k, + block_tables, + cache_k_per_batch_maxs, + cache_v_per_batch_maxs, + pre_key_cache=None, + pre_value_cache=None, + cache_k_quant_scales=None, + cache_v_quant_scales=None, + cache_k_dequant_scales=None, + cache_v_dequant_scales=None, + qkv_out_scale=None, + qkv_bias=None, + out_shift=None, + out_smooth=None, + max_enc_len_this_time=None, + max_dec_len_this_time=None, + rope_emb=None, + mask=None, + tgt_mask=None, + max_seq_len=-1, + block_size=64, + use_neox_style=False, + use_dynamic_cachekv_quant=False, + quant_round_type=1, + quant_max_bound=127.0, + quant_min_bound=-127.0, + out_scale=-1, + compute_dtype="default", +): + if in_dynamic_mode(): + return _C_ops.block_multihead_attention_xpu( + qkv, + key_cache, + value_cache, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + padding_offsets, + cum_offsets, + cu_seqlens_q, + cu_seqlens_k, + block_tables, + cache_k_per_batch_maxs, + cache_v_per_batch_maxs, + pre_key_cache, + pre_value_cache, + rope_emb, + mask, + tgt_mask, + cache_k_quant_scales, + cache_v_quant_scales, + cache_k_dequant_scales, + cache_v_dequant_scales, + qkv_out_scale, + qkv_bias, + out_shift, + out_smooth, + max_enc_len_this_time, + max_dec_len_this_time, + max_seq_len, + block_size, + use_neox_style, + use_dynamic_cachekv_quant, + quant_round_type, + quant_max_bound, + quant_min_bound, + out_scale, + compute_dtype, + ) + + helper = LayerHelper('block_multihead_attention_xpu', **locals()) + out = helper.create_variable_for_type_inference(dtype=qkv.dtype) + + inputs = {} + inputs['qkv'] = qkv + inputs['key_cache'] = key_cache + inputs['value_cache'] = value_cache + inputs['seq_lens_encoder'] = seq_lens_encoder + inputs['seq_lens_decoder'] = seq_lens_decoder + inputs['seq_lens_this_time'] = seq_lens_this_time + inputs['padding_offsets'] = padding_offsets + inputs['cum_offsets'] = cum_offsets + inputs['cu_seqlens_q'] = cu_seqlens_q + inputs['cu_seqlens_k'] = cu_seqlens_k + inputs['block_tables'] = block_tables + inputs['cache_k_per_batch_maxs'] = cache_k_per_batch_maxs + inputs['cache_v_per_batch_maxs'] = cache_v_per_batch_maxs + if pre_key_cache is not None: + inputs['pre_key_cache'] = pre_key_cache + if pre_value_cache is not None: + inputs['pre_value_cache'] = pre_value_cache + if rope_emb is not None: + inputs['rope_emb'] = rope_emb + if mask is not None: + inputs['mask'] = mask + if tgt_mask is not None: + inputs['tgt_mask'] = tgt_mask + if cache_k_quant_scales is not None: + inputs["cache_k_quant_scales"] = cache_k_quant_scales + if cache_v_quant_scales is not None: + inputs["cache_v_quant_scales"] = cache_v_quant_scales + if cache_k_dequant_scales is not None: + inputs["cache_k_dequant_scales"] = cache_k_dequant_scales + if cache_v_dequant_scales is not None: + inputs["cache_v_dequant_scales"] = cache_v_dequant_scales + if qkv_out_scale is not None: + inputs["qkv_out_scale"] = 
qkv_out_scale + if qkv_bias is not None: + inputs["qkv_bias"] = qkv_bias + if out_shift is not None: + inputs["out_shift"] = out_shift + if out_smooth is not None: + inputs["out_smooth"] = out_smooth + if max_enc_len_this_time is not None: + inputs["max_enc_len_this_time"] = max_enc_len_this_time + if max_dec_len_this_time is not None: + inputs["max_dec_len_this_time"] = max_dec_len_this_time + + outputs = { + 'fmha_out': out, + 'qkv_out': qkv, + 'key_cache_out': key_cache, + 'value_cache_out': value_cache, + } + helper.append_op( + type='block_multihead_attention_xpu', + inputs=inputs, + outputs=outputs, + attrs={ + 'max_seq_len': max_seq_len, + 'block_size': block_size, + 'use_neox_style': use_neox_style, + 'dynamic_cachekv_quant': use_dynamic_cachekv_quant, + 'quant_round_type': quant_round_type, + 'quant_max_bound': quant_max_bound, + 'quant_min_bound': quant_min_bound, + 'out_scale': out_scale, + 'compute_dtype': compute_dtype, + }, + ) + return out, qkv, key_cache, value_cache diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 56a0d8a613be6..2367d5518ed92 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -337,14 +337,12 @@ def output_spec(self, spec): return if not isinstance(spec, list): raise TypeError( - "The config `output_spec` should be 'list', but received input type is %s." - % type(input) + f"The config `output_spec` should be 'list', but received input type is {type(input)}." ) for var in spec: if not isinstance(var, core.eager.Tensor): raise TypeError( - "The element in config `output_spec` list should be 'Variable', but received element's type is %s." - % type(var) + f"The element in config `output_spec` list should be 'Variable', but received element's type is {type(var)}." ) self._output_spec = spec @@ -358,8 +356,7 @@ def model_filename(self, filename): return if not isinstance(filename, str): raise TypeError( - "The config `model_filename` should be str, but received input's type is %s." - % type(filename) + f"The config `model_filename` should be str, but received input's type is {type(filename)}." ) if len(filename) == 0: raise ValueError("The config `model_filename` is empty string.") @@ -375,8 +372,7 @@ def params_filename(self, filename): return if not isinstance(filename, str): raise TypeError( - "The config `params_filename` should be str, but received input's type is %s." - % type(filename) + f"The config `params_filename` should be str, but received input's type is {type(filename)}." ) if len(filename) == 0: raise ValueError("The config `params_filename` is empty string.") @@ -392,8 +388,7 @@ def keep_name_table(self, value): return if not isinstance(value, bool): raise TypeError( - "The config `keep_name_table` should be bool value, but received input's type is %s." - % type(value) + f"The config `keep_name_table` should be bool value, but received input's type is {type(value)}." ) self._keep_name_table = value @@ -413,8 +408,7 @@ def _parse_save_configs(configs): for key in configs: if key not in supported_configs: raise ValueError( - "The additional config (%s) of `paddle.jit.save` is not supported." - % (key) + f"The additional config ({key}) of `paddle.jit.save` is not supported." ) # construct inner config @@ -439,8 +433,7 @@ def _parse_load_config(configs): for key in configs: if key not in supported_configs: raise ValueError( - "The additional config (%s) of `paddle.jit.load` is not supported." - % (key) + f"The additional config ({key}) of `paddle.jit.load` is not supported." 
) # construct inner config @@ -554,7 +547,7 @@ def _get_output_vars(outputs, output_spec, with_hook=False): output_size = len(result_list) if len(output_spec) == output_size: for var in output_spec: - if not isinstance(var, paddle.pir.Value, int): + if not isinstance(var, (paddle.pir.Value, int)): warnings.warn(output_spec_is_not_value_error % var.name) else: if var not in ValueSet(result_list): @@ -636,9 +629,9 @@ def _build_load_path_and_config(path, config): ) elif not prefix_format_exist and not directory_format_exist: raise ValueError( - "The ``path`` (%s) to load model not exists. " + f"The ``path`` ({path}) does not exist to load a model. " "Please make sure that *.pdmodel exists or " - "don't using ``skip_forward=True`` to jit.save." % path + "don't use ``skip_forward=True`` in jit.save." ) else: if prefix_format_exist: @@ -954,8 +947,7 @@ def save(layer, path, input_spec=None, **configs): isinstance(layer, (Layer, StaticFunction)) or inspect.isfunction(layer) ): raise TypeError( - "The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is %s." - % type(layer) + f"The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is {type(layer)}." ) elif inspect.isfunction(layer) or isinstance(layer, StaticFunction): warnings.warn( @@ -996,14 +988,12 @@ def save(layer, path, input_spec=None, **configs): and 'forward' != attr_func ): raise ValueError( - "If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is %s." - % type(input_spec) + f"If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is {type(input_spec)}." ) if not isinstance(input_spec, (list, tuple)): raise TypeError( - "The input input_spec should be 'list', but received input_spec's type is %s." - % type(input_spec) + f"The input input_spec should be 'list', but received input_spec's type is {type(input_spec)}." ) inner_input_spec = [] for var in paddle.utils.flatten(input_spec):
diff --git a/python/paddle/jit/dy2static/ast_utils.py b/python/paddle/jit/dy2static/ast_utils.py index fc703dd6f6e49..7c4c90ec44d0e 100644 --- a/python/paddle/jit/dy2static/ast_utils.py +++ b/python/paddle/jit/dy2static/ast_utils.py @@ -27,8 +27,7 @@ def ast_to_source_code(ast_node): """ if not isinstance(ast_node, (gast.AST, ast.AST)): raise TypeError( - "Type of ast_root should be gast.AST or ast.AST, but received %s." - % type(ast_node) + f"Type of ast_root should be gast.AST or ast.AST, but received {type(ast_node)}." ) if isinstance(ast_node, gast.AST): ast_node = gast.gast_to_ast(ast_node)
diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 7ef8b4ce88736..10d2c9633ae80 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -615,8 +615,7 @@ def convert_len(var): return paddle.tensor.array_length(var) else: raise TypeError( - 'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received %s.' - % type(var) + f'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received {type(var)}.'
) elif isinstance(var, Value): if var.is_dense_tensor_type() or var.is_selected_row_type(): diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py index 7d5605f547df8..ce0b8382e9d01 100644 --- a/python/paddle/jit/dy2static/function_spec.py +++ b/python/paddle/jit/dy2static/function_spec.py @@ -179,7 +179,7 @@ def pir_to_static_inputs_with_spec(self, input_with_spec, main_program): if isinstance(var_spec, paddle.static.InputSpec): stop_gradient = getattr(var_spec, 'stop_gradient', False) feed_value = paddle.static.input.data( - name=var_spec.name or "feed_%s" % i, + name=var_spec.name or f"feed_{i}", shape=var_spec.shape, dtype=convert_dtype(var_spec.dtype), ) @@ -232,7 +232,7 @@ def to_static_inputs_with_spec(self, input_with_spec, main_program): stop_gradient = getattr(var_spec, 'stop_gradient', False) feed_layer = block.create_var( # TODO(Aurelius84): consider a more elegant way to name this - name=var_spec.name or "feed_%s" % i, + name=var_spec.name or f"feed_{i}", shape=var_spec.shape, dtype=var_spec.dtype, is_data=True, diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 8571740db2659..f4fc6ea387f97 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -1108,8 +1108,7 @@ def _check_params_all_inited(self, main_program): """ if not isinstance(self._params, (list, tuple)): raise TypeError( - "Type of self._params in PartialProgramLayer should be list or tuple, but received %s." - % type(self._params) + f"Type of self._params in PartialProgramLayer should be list or tuple, but received {type(self._params)}." ) param_and_buffer_names_set = set() @@ -1127,12 +1126,11 @@ def _check_params_all_inited(self, main_program): if name not in param_and_buffer_names_set: raise ValueError( "\n\tWe don't support to define layer with parameters in the function decorated by `@to_static`." - "\n\tBut we found parameter(%s) was created in the decorated function." + f"\n\tBut we found parameter({name}) was created in the decorated function." "\n" "\n\tRevise suggestion: " "\n\t\t1. Please ensure all your sublayers are inherited from nn.Layer." "\n\t\t2. Please use nn.ParameterList and nn.LayerList as container instead of using a native Python container such as List" - % name ) def _valid_vars(self, vars): diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index 55d8ab47e92a4..ff6ee46c8a1f9 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -1257,8 +1257,7 @@ def _check_params_all_inited(self, main_program): """ if not isinstance(self._params, (list, tuple)): raise TypeError( - "Type of self._params in PartialProgramLayer should be list or tuple, but received %s." - % type(self._params) + f"Type of self._params in PartialProgramLayer should be list or tuple, but received {type(self._params)}." 
) param_and_buffer_names_set = set() diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index ea4040485b64a..d1a85626c17fc 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -1597,8 +1597,7 @@ def _build_once(self, cache_key): def __getitem__(self, item): if not isinstance(item, CacheKey): raise ValueError( - 'type(item) should be CacheKey, but received %s' - % type_name(item) + f'type(item) should be CacheKey, but received {type_name(item)}' ) item_id = hash(item) self._recent_cache_key = item @@ -1621,8 +1620,7 @@ def get_program_without_cache(self, cache_key): def get_program(self, item): if not isinstance(item, CacheKey): raise ValueError( - "Input item's type should be FunctionSpec, but received %s" - % type_name(item) + f"Input item's type should be FunctionSpec, but received {type_name(item)}" ) item_id = hash(item) if item_id not in self._caches: diff --git a/python/paddle/jit/dy2static/transformers/early_return_transformer.py b/python/paddle/jit/dy2static/transformers/early_return_transformer.py index 4dab1e5ab1638..ce8cf9e606878 100644 --- a/python/paddle/jit/dy2static/transformers/early_return_transformer.py +++ b/python/paddle/jit/dy2static/transformers/early_return_transformer.py @@ -36,9 +36,7 @@ def transform(self): def is_define_return_in_if(self, node): assert isinstance( node, gast.If - ), "Type of input node should be gast.If, but received %s ." % type( - node - ) + ), f"Type of input node should be gast.If, but received {type(node)}." for child in node.body: if isinstance(child, gast.Return): return True diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index ad195befba4b5..03a2cd06d3211 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -204,7 +204,7 @@ def make_hashable(x, error_msg=None): return tuple(map(make_hashable, x.values())) error_msg = error_msg or "Requires a hashable object." - raise ValueError(error_msg + " But received type: %s" % type_name(x)) + raise ValueError(f"{error_msg} But received type: {type_name(x)}") return x @@ -327,8 +327,7 @@ def func_prefix(func): callable_func = getattr(module, func_name) else: raise ValueError( - 'Function: %s doesn\'t exist in the Module transformed from AST.' - % func_name + f'Function: {func_name} doesn\'t exist in the Module transformed from AST.' ) # After transform dygraph function into callable_func saved in tmp file, # it lost the global variables from imported statements or defined in source file. diff --git a/python/paddle/jit/pir_translated_layer.py b/python/paddle/jit/pir_translated_layer.py index 8a6e3ede35e2a..6bdf8f2952d8d 100644 --- a/python/paddle/jit/pir_translated_layer.py +++ b/python/paddle/jit/pir_translated_layer.py @@ -217,7 +217,6 @@ def _load_pir_parameter_vars(model_path, program_holder, params_filename): # load all vars assert params_filename is not None, "params_filename should not be None." var_file_path = os.path.join(model_path, params_filename) - if os.path.exists(var_file_path): core.load_combine_func( var_file_path, @@ -228,8 +227,7 @@ def _load_pir_parameter_vars(model_path, program_holder, params_filename): ) else: raise ValueError( - "The file %s does not exist. Please check the model path." - % var_file_path + f"The file {var_file_path} does not exist. Please check the model path." 
) load_var_dict.update(other_var_dict) @@ -328,8 +326,7 @@ def _run_dygraph(instance, input, program_holder): for i, value in enumerate(input): if not isinstance(value, (np.ndarray, core.eager.Tensor)): raise TypeError( - "The type of input in PirTranslatedLayer must be numpy array or Variable(Tensor), but received %s." - % type(value) + f"The type of input in PirTranslatedLayer must be numpy array or Variable(Tensor), but received {type(value)}." ) # NOTE: In order to unify the API, firstly convert the input to Tensor if isinstance(value, np.ndarray): @@ -361,8 +358,7 @@ def _run_dygraph(instance, input, program_holder): persistable_tensors.append(instance._buffers[dy_var_name]) else: raise ValueError( - "The persistable variable %s does not exist in current PirTranslatedLayer." - % var_name + f"The persistable variable {var_name} does not exist in current PirTranslatedLayer." ) from paddle.jit.dy2static.pir_partial_program import PartialProgramLayer @@ -378,7 +374,6 @@ def _run_dygraph(instance, input, program_holder): parameters, ) instance.layer = layer - if instance._is_test: layer.training = False else: @@ -392,9 +387,42 @@ def _run_dygraph(instance, input, program_holder): return instance.layer(input_tensors) -def _run_static_graph(program_holder, trace_program): - paddle.base.framework.switch_main_program(trace_program) - return program_holder.output_vars +def _run_static_graph(inputs, program_holder, src_program): + ''' + This function is used when the PirTranslatedLayer is + applied in dy_to_static conversion. + ''' + dst_program = paddle.static.default_main_program() + value_map = paddle.pir.IrMapping() + # Establish a mapping relationship between existing parameters + # and corresponding parameters in the program to be copied + len_dst_op = len(dst_program.global_block().ops) + for dst_op in dst_program.global_block().ops: + if dst_op.name() == "builtin.parameter": + for src_op in src_program.global_block().ops[:len_dst_op]: + if ( + src_op.name() == dst_op.name() + and src_op.result(0).name == dst_op.result(0).name + ): + for i in range(src_op.num_results()): + value_map.add(src_op.result(i), dst_op.result(i)) + # Establish a mapping relationship between the real inputs + # and the corresponding inputs in the program to be copied + src_inputs = program_holder.input_vars + if len(src_inputs) != len(inputs): + raise ValueError( + f"The number of inputs is invalid, expected {len(src_inputs)}, but received {len(inputs)}." + ) + for src_input, input_ in zip(src_inputs, inputs): + value_map.add(src_input, input_) + + # find the insert point for copy + current_insert_point = paddle.pir.get_current_insertion_point() + current_block = current_insert_point.block() + src_program.copy_to_block(value_map, current_block) + + output = [value_map.look_up(v) for v in program_holder.output_vars] + return output[0] if len(output) == 1 else output def _collect_current_and_parent_var(program, block_idx): @@ -561,7 +589,7 @@ def _construct(model_path, configs=None): # 0.
dir and filename check model_path = os.path.normpath(model_path) if not os.path.isdir(model_path): - raise ValueError("There is no directory named '%s'" % model_path) + raise ValueError(f"There is no directory named '{model_path}'") model_filename = None params_filename = None if configs is not None: @@ -608,7 +636,7 @@ def __i_m_p_l__(self, *input): return _run_dygraph(self, input, program_holder) else: return _run_static_graph( - program_holder, program_holder.infer_program + input, program_holder, program_holder.infer_program ) __i_m_p_l__.__name__ = method_name @@ -719,8 +747,7 @@ def _get_program_holder(self, method_name='forward'): program_holder = self._program_holder_dict.get(method_name, None) if program_holder is None: raise ValueError( - "The method `%s` does not exist in loaded PirTranslatedLayer." - % method_name + f"The method `{method_name}` does not exist in loaded PirTranslatedLayer." ) return program_holder diff --git a/python/paddle/jit/sot/infer_meta.py b/python/paddle/jit/sot/infer_meta.py index 3ec9f0d891c9e..a67b10c27105f 100644 --- a/python/paddle/jit/sot/infer_meta.py +++ b/python/paddle/jit/sot/infer_meta.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations from functools import cached_property +from typing import TypeVar import paddle from paddle.amp.auto_cast import amp_state @@ -26,10 +28,32 @@ from .utils import Cache, Singleton, map_if_extend, meta_str +DynamicSymbolT = TypeVar("DynamicSymbolT") + + +class SymbolicInt(metaclass=Singleton): + def __eq__(self, other) -> bool: + return isinstance(other, (int, SymbolicInt)) + + def __repr__(self) -> str: + return "SymbolicInt()" + + def __str__(self) -> str: + return "SymbolicInt()" + class MetaInfo: def __init__( - self, shape, dtype, stop_gradient, name, persistable, type, place + self, + shape, + dtype, + stop_gradient, + name, + persistable, + type, + place, + *, + dynamic_axes: list[int] | None = None, ): self.name = name self.persistable = persistable @@ -38,9 +62,18 @@ def __init__( self.shape = shape self.dtype = dtype self.stop_gradient = stop_gradient + self.dynamic_axes = dynamic_axes or [] + + def get_dynamic_shape( + self, dynamic_symbol: DynamicSymbolT = -1 + ) -> list[int | DynamicSymbolT]: + return [ + dim if i not in self.dynamic_axes else dynamic_symbol + for i, dim in enumerate(self.shape) + ] @staticmethod - def from_tensor(tensor): + def from_tensor(tensor, *, dynamic_axes: list[int] | None = None): if isinstance(tensor, paddle.pir.Value): name = "Value@NoName" else: # For Tensor or Variable @@ -54,6 +87,7 @@ def from_tensor(tensor): ) assert isinstance(dtype, expected_dtype_class) + # TODO(@xiongkun) remove after pir become default state. # We always use float32 in simulation if AMP is enabled. current_amp_state = amp_state() if ( @@ -63,7 +97,12 @@ def from_tensor(tensor): and current_amp_state["dtype"] == "float16" ): dtype = paddle.float32 - # TODO(@xiongkun) remove after pir become default state. 
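Editorial note, not part of this patch: `SymbolicInt` above is the sentinel that dynamic dims are rendered as inside guard strings (see `guard_str` below and `make_stringify_guard` in variables/basic.py further down), so a recorded guard keeps matching when only the dynamic dims change; its `__eq__` also accepts any plain int, so structural shape comparisons still pass. A standalone miniature of the idea:

class FakeSymbolicInt:  # stand-in for SymbolicInt, illustration only
    def __eq__(self, other):
        return isinstance(other, (int, FakeSymbolicInt))

    def __repr__(self):
        return "SymbolicInt()"

assert FakeSymbolicInt() == 3 and FakeSymbolicInt() == 4096
print([FakeSymbolicInt(), 128])  # [SymbolicInt(), 128] -- the form embedded in guard strings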
+ dynamic_axes = dynamic_axes or [] + dynamic_axes = [ + i + for i, dim in enumerate(tensor.shape) + if dim == -1 or i in dynamic_axes + ] return MetaInfo( list(tensor.shape), dtype, @@ -72,6 +111,7 @@ def from_tensor(tensor): persistable, tensor.type, tensor.place, + dynamic_axes=dynamic_axes, ) def is_dynamic_shape(self): @@ -82,12 +122,14 @@ def is_dynamic_shape(self): return -1 in self.shape def to_input_spec(self): + shape = self.get_dynamic_shape(None) return paddle.static.InputSpec( - self.shape, dtype=self.dtype, stop_gradient=self.stop_gradient + shape, dtype=self.dtype, stop_gradient=self.stop_gradient ) def guard_str(self): - return f"({self.shape}, {self.dtype}, {self.stop_gradient})" + shape = self.get_dynamic_shape(SymbolicInt()) + return f"({shape}, {self.dtype}, {self.stop_gradient})" def __repr__(self): return meta_str(self.shape, self.dtype, self.stop_gradient) @@ -161,20 +203,22 @@ def startup_program(self): else: return self.legacy_programs[1] - def create_var(self, meta): + def create_var(self, meta: MetaInfo): + shape = meta.get_dynamic_shape() + if paddle.framework.use_pir_api(): with paddle.static.program_guard( self.main_program, self.startup_program ): var = paddle.static.input.data( name=self.gen_name(meta), - shape=meta.shape, + shape=shape, dtype=convert_dtype(meta.dtype), ) var.stop_gradient = meta.stop_gradient else: var = self.main_program.global_block().create_var( - shape=meta.shape, + shape=shape, dtype=meta.dtype, stop_gradient=meta.stop_gradient, ) @@ -193,9 +237,10 @@ def infer_meta(self, func, *args, **kwargs): with paddle.base.framework._dygraph_guard(None), UniqueNameGuard( self.var_name_generator ): - args, kwargs = convert_meta_to_variable( - args - ), convert_meta_to_variable(kwargs) + args, kwargs = ( + convert_meta_to_variable(args), + convert_meta_to_variable(kwargs), + ) with paddle.static.program_guard( self.main_program, self.startup_program @@ -225,9 +270,11 @@ def convert_meta_to_input_spec(args): pred=lambda x: isinstance(x, MetaInfo), true_fn=lambda x: x.to_input_spec(), # TODO(xiongkun): can x be tensor ? 
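Editorial note, not part of this patch: `to_input_spec` above passes the dynamic axes to `paddle.static.InputSpec` as `None`, so only the static dims get pinned. The substitution `get_dynamic_shape` performs, shown in isolation:

shape, dynamic_axes = [32, 128, 768], [0]  # axis 0 (batch) marked dynamic
spec_shape = [None if i in dynamic_axes else d for i, d in enumerate(shape)]
print(spec_shape)  # [None, 128, 768]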
- false_fn=lambda x: paddle.static.InputSpec.from_tensor(x) - if isinstance(x, paddle.Tensor) - else x, + false_fn=lambda x: ( + paddle.static.InputSpec.from_tensor(x) + if isinstance(x, paddle.Tensor) + else x + ), ) diff --git a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py index f94884d0c118b..bbefddda639ad 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py +++ b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py @@ -56,12 +56,16 @@ class OpcodeExecutorCache(metaclass=Singleton): MAX_CACHE_SIZE = 20 cache: dict[types.CodeType, GuardedFunctions] translate_count: int - symbolic_inputs: dict[str, dict[int, int]] + code_symbolic_inputs: dict[types.CodeType, dict[str, dict[int, int]]] def __init__(self): self.cache = {} self.translate_count = 0 - self.symbolic_inputs = {} + self.code_symbolic_inputs = {} + + def get_symbolic_inputs(self, code: types.CodeType): + self.code_symbolic_inputs.setdefault(code, {}) + return self.code_symbolic_inputs[code] def clear(self): """ @@ -69,6 +73,7 @@ def clear(self): """ self.cache.clear() self.translate_count = 0 + self.code_symbolic_inputs.clear() def __call__(self, frame: types.FrameType, **kwargs) -> CustomCode: code: types.CodeType = frame.f_code diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py index 99ea75ebbcd48..93de3c8dfe815 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py +++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py @@ -22,9 +22,9 @@ from collections import namedtuple from copy import deepcopy from functools import cached_property -from typing import Any, Callable +from typing import Any, Callable, Tuple, Union -from typing_extensions import TypeGuard +from typing_extensions import TypeAlias, TypeGuard import paddle from paddle.jit.utils import OrderedSet @@ -37,7 +37,7 @@ ast_infer_meta, ) from ...profiler import EventGuard, event_register -from ...symbolic.statement_ir import Reference, Symbol +from ...symbolic.statement_ir import Reference, StatementIR, Symbol from ...symbolic.symbolic_context import SymbolicTraceContext from ...utils import ( NameGenerator, @@ -81,6 +81,15 @@ map_variables, ) +CompileGraphResult: TypeAlias = Tuple[ + Callable[..., Any], + Tuple[ + StatementIR, + OrderedSet[Union[TensorVariable, SymbolicVariable]], + OrderedSet[Union[TensorVariable, SymbolicVariable]], + ], +] + def convert_to_meta(inputs: Any): """ @@ -329,7 +338,7 @@ def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx): self.pycode_gen.gen_enable_eval_frame() - name_gen = NameGenerator("__start_compile_saved_orig_") + name_gen = NameGenerator("___compile_fn_saved_orig_") # here is not update changed values, it just give names to stack vars # and want keep same interface as _build_compile_fn_with_name_store @@ -344,13 +353,18 @@ def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx): return VariableLoader(store_var_info, self.pycode_gen) - def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info): + def _build_compile_fn_with_name_store( + self, + compile_graph_result: CompileGraphResult, + to_store_vars, + store_var_info, + ): # var_id -> local_name mapping to_store_vars = list( filter(lambda x: not isinstance(x, NullVariable), to_store_vars) ) - self.start_compile(*to_store_vars) - name_gen = 
NameGenerator("__start_compile_saved_") + self.compile_function(compile_graph_result, to_store_vars) + name_gen = NameGenerator("___compile_fn_saved_") for var in to_store_vars[::-1]: if store_var_info[var.id] is None: @@ -363,23 +377,38 @@ def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info): return VariableLoader(store_var_info, self.pycode_gen) - def get_compiled_fn(self, *ret_vars): + def compile_graph(self, *ret_vars: VariableBase) -> CompileGraphResult: ret_items = [ ret_item for ret_var in ret_vars for ret_item in ret_var.flatten_items() ] - tensor_items = self._find_tensor_outputs(ret_items) - compiled_fn, _ = self.sir_ctx.compile_fn( - [Symbol(tensor_var.var_name) for tensor_var in tensor_items], + symbolic_outputs = self._find_tensor_outputs(ret_items) + statement_ir = self.sir_ctx.return_TOS( + [Symbol(tensor_var.var_name) for tensor_var in symbolic_outputs] + ) + if not statement_ir.statements: + return self.sir_ctx.compile_do_nothing(), ( + statement_ir, + OrderedSet(), + OrderedSet(), + ) + input_names = statement_ir.inputs + symbolic_inputs = self._find_tensor_inputs(input_names) + compiled_fn = self.sir_ctx.compile_fn( + statement_ir.name, + [var.meta.to_input_spec() for var in symbolic_inputs], **self._kwargs, ) + return compiled_fn, (statement_ir, symbolic_inputs, symbolic_outputs) - return compiled_fn - - @event_register("start_compile", event_level=2) - def start_compile(self, *ret_vars: VariableBase): + @event_register("compile_function", event_level=2) + def compile_function( + self, + compile_graph_result: CompileGraphResult, + ret_vars: list[VariableBase], + ): """ Generate bytecode based on the information collected by the simulation execution. @@ -393,48 +422,24 @@ def start_compile(self, *ret_vars: VariableBase): """ from ..breakpoint import BreakpointManager - BreakpointManager().on_event("start_compile") - - ret_items = [ - ret_item - for ret_var in ret_vars - for ret_item in ret_var.flatten_items() - ] - - tensor_items = self._find_tensor_outputs(ret_items) - compiled_fn, statement_ir = self.sir_ctx.compile_fn( - [Symbol(tensor_var.var_name) for tensor_var in tensor_items], - **self._kwargs, - ) - input_names = statement_ir.inputs - compiled_fn_name = f"__compiled_fn_{statement_ir.name}" + BreakpointManager().on_event("compile_function") + graph_fn, ( + statement_ir, + symbolic_inputs, + symbolic_outputs, + ) = compile_graph_result + compiled_fn_name = f"___graph_fn_{statement_ir.name}" # prepare function and inputs - self.pycode_gen.gen_load_object(compiled_fn, compiled_fn_name) - for name in input_names: - found = False - for variable in self.input_variables: - if ( - isinstance(variable, (TensorVariable, SymbolicVariable)) - and variable.get_symbol().name == name - ): - if isinstance(variable, SymbolicVariable): - self.pycode_gen.gen_load_object( - paddle.to_tensor, "___paddle_to_tensor" - ) - variable.tracker.gen_instructions(self.pycode_gen) - found = True - if isinstance(variable, SymbolicVariable): - self.pycode_gen.gen_call_function(1) - break - assert found, f"can't find input {name} in SIR." + self.pycode_gen.gen_load_object(graph_fn, compiled_fn_name) + self.gen_load_inputs(symbolic_inputs) # Pack all args into a tuple, because we don't support *args now. 
- self.pycode_gen.gen_build_tuple(count=len(input_names)) - # call the compiled_fn + self.pycode_gen.gen_build_tuple(count=len(symbolic_inputs)) + # call the graph_fn self.pycode_gen.gen_call_function(argc=1) # Store outputs to f_locals - self.pycode_gen.gen_unpack_sequence(count=len(tensor_items)) - for tensor_var in tensor_items: + self.pycode_gen.gen_unpack_sequence(count=len(symbolic_outputs)) + for tensor_var in symbolic_outputs: self.pycode_gen.gen_store_fast(tensor_var.out_var_name) # restore the outputs. for ret_var in ret_vars: @@ -725,6 +730,36 @@ def remove_global_guarded_variable(self, variable: VariableBase): if variable in self._global_guarded_variables: self._global_guarded_variables.remove(variable) + def _find_tensor_inputs( + self, input_names: list[str] + ) -> OrderedSet[TensorVariable | SymbolicVariable]: + inputs: OrderedSet[TensorVariable | SymbolicVariable] = OrderedSet() + for name in input_names: + found = False + for variable in self.input_variables: + if ( + isinstance(variable, (TensorVariable, SymbolicVariable)) + and variable.get_symbol().name == name + ): + inputs.add(variable) + found = True + break + assert found, f"can't find input {name} in SIR." + assert len(inputs) == len(input_names), "Number of inputs not match." + return inputs + + def gen_load_inputs( + self, inputs: OrderedSet[TensorVariable | SymbolicVariable] + ): + for input_var in inputs: + if isinstance(input_var, SymbolicVariable): + self.pycode_gen.gen_load_object( + paddle.to_tensor, "___paddle_to_tensor" + ) + input_var.tracker.gen_instructions(self.pycode_gen) + if isinstance(input_var, SymbolicVariable): + self.pycode_gen.gen_call_function(1) + def _find_tensor_outputs( self, outputs: list[VariableBase] ) -> OrderedSet[TensorVariable | SymbolicVariable]: @@ -738,12 +773,14 @@ def _find_tensor_outputs( def is_graph_output( var, ) -> TypeGuard[TensorVariable | SymbolicVariable]: - return isinstance(var.tracker, DummyTracker) and isinstance( - var, (TensorVariable, SymbolicVariable) - ) + return isinstance( + var.tracker, (DummyTracker, SymbolicOperationTracker) + ) and isinstance(var, (TensorVariable, SymbolicVariable)) def collect_related_dummy_tensor(var): - if isinstance(var.tracker, DummyTracker): + if isinstance( + var.tracker, (DummyTracker, SymbolicOperationTracker) + ): if is_graph_output(var): return [var] else: @@ -758,7 +795,9 @@ def collect_related_dummy_tensor(var): ] = OrderedSet() # Find Tensor Variables from outputs. 
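Editorial note, not part of this patch: the loop below must also materialize tensors that are reachable only through non-tensor outputs (for example a tensor sitting inside a returned list). The recursion, shrunk to a runnable toy with stand-in classes:

class Var:
    def __init__(self, is_tensor, inputs=()):
        self.is_tensor, self.inputs = is_tensor, inputs

def collect_graph_outputs(var):
    if var.is_tensor:  # a concrete graph output
        return [var]
    found = []
    for inp in var.inputs:  # e.g. a container variable wrapping tensors
        found += collect_graph_outputs(inp)
    return found

t = Var(True)
assert collect_graph_outputs(Var(False, inputs=(t,))) == [t]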
for output in outputs: - if isinstance(output.tracker, DummyTracker): + if isinstance( + output.tracker, (DummyTracker, SymbolicOperationTracker) + ): if is_graph_output(output): output_tensors.add(output) else: @@ -809,7 +848,7 @@ def restore_print_stmts(self, variables: list[VariableBase]): add_to_global_guarded_vars=False, ) - def restore_inplace_tensor(self, variables: list[VariableBase]): + def restore_inplace_tensor(self, variables: OrderedSet[VariableBase]): for var in variables: if not var.tracker.is_traceable(): continue diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 70870913a6a02..3146609a595b0 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -1737,11 +1737,12 @@ def RETURN_CONST(self, instr: Instruction): return self.compile_return(ret_const) def compile_return(self, ret_val): - compile_fn = self._graph.get_compiled_fn(ret_val) - if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): + compile_graph_result = self._graph.compile_graph(ret_val) + graph_fn, _ = compile_graph_result + if graph_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): self.new_code = None else: - self._graph.start_compile(ret_val) + self._graph.compile_function(compile_graph_result, [ret_val]) self._graph.pycode_gen.gen_return() self.new_code = self._graph.pycode_gen.gen_pycode() self.guard_fn = self._graph.guard_fn @@ -1775,15 +1776,16 @@ def get_compute_fn_and_update_changed_vars( store_vars.append(_var) store_var_info[_var.id] = name - compile_fn = self._graph.get_compiled_fn(*store_vars) + compile_graph_result = self._graph.compile_graph(*store_vars) + graph_fn, _ = compile_graph_result - if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): + if graph_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): return self._graph._restore_origin_opcode( list(stack), store_var_info, end_idx ) else: return self._graph._build_compile_fn_with_name_store( - store_vars, store_var_info + compile_graph_result, store_vars, store_var_info ) @fallback_when_occur_error diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker.py b/python/paddle/jit/sot/opcode_translator/executor/tracker.py index 41ce17dba7cbc..85a7f68f6847a 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/tracker.py +++ b/python/paddle/jit/sot/opcode_translator/executor/tracker.py @@ -127,7 +127,7 @@ def need_guard(self) -> bool: return False -class SymbolicOperationTracker(DummyTracker): +class SymbolicOperationTracker(Tracker): """ SymbolicOperationTracker is a subclass of Tracker that specifically tracks variables cannot be reproduced from the frame. It is mostly generated by complex operations of symbolic variables. 
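# [Sketch, not part of this patch] The tracker hierarchy after this change:
#
#     Tracker
#     |-- DummyTracker               # produced by traced tensor operations
#     `-- SymbolicOperationTracker   # produced by symbolic-int operations
#
# Because SymbolicOperationTracker no longer derives from DummyTracker,
# isinstance(var.tracker, DummyTracker) stops matching symbolic operations,
# which is why the call sites above now check for both tracker types.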
@@ -151,6 +151,14 @@ def trace_value_from_frame(self): def __repr__(self) -> str: return f"SymbolicOperationTracker(num_inputs={len(self.inputs)})" + def is_traceable(self): + # TODO(zrr1999): to implement gen_instructions and trace_value_from_frame + return False + + def need_guard(self) -> bool: + # TODO(zrr1999): to implement gen_instructions and trace_value_from_frame + return False + class DanglingTracker(Tracker): """ diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py index 965b7edba28ed..ffec4b1485cb6 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py @@ -90,6 +90,8 @@ core.DataType.BOOL: "bool", } +STATIC_DIM_FREQ_THRESHOLD = 5 + class ConstantVariable(VariableBase): """ @@ -174,24 +176,6 @@ def chr(self): DummyTracker([self]), ) - @check_guard - def make_stringify_guard(self) -> list[StringifyExpression]: - if ( - ENV_SOT_ALLOW_DYNAMIC_SHAPE.get() - and isinstance(self.value, int) - and self.tracker.need_guard() - ): - from ..executor_cache import OpcodeExecutorCache - - frame_value_tracer = self.tracker.trace_value_from_frame() - symbolic_inputs = OpcodeExecutorCache().symbolic_inputs - symbolic_inputs.setdefault(frame_value_tracer.inlined_expr, {}) - symbolic_input = symbolic_inputs[frame_value_tracer.inlined_expr] - symbolic_input.setdefault(self.value, 0) - symbolic_input[self.value] += 1 - - return super().make_stringify_guard() - @VariableFactory.register_from_value() def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): if type(value) in ConstTypes: @@ -349,10 +333,24 @@ def __init__( raise InnerError( f"Required type(tensor) is paddle.Tensor or ProxyTensor, but received {type(tensor).__name__}." ) + dynamic_axes: list[int] = [] + if ENV_SOT_ALLOW_DYNAMIC_SHAPE.get() and self.tracker.is_traceable(): + dynamic_axes = self.analyse_dynamic_axes() + self.meta.dynamic_axes = dynamic_axes self.origin_meta = self.meta self.var_name = TensorVariable.var_name_generator.next() self.graph.side_effects.record_mutable_variable(self) + def analyse_dynamic_axes(self): + shape_dims = ( + self.shape.proxy.get_all() + ) # Trigger convert all shape dims to Variable + return [ + i + for i, dim in enumerate(shape_dims) + if isinstance(dim, SymbolicVariable) + ] + def __len__(self): if self.meta.shape[0] == -1: raise BreakGraphError( @@ -399,9 +397,13 @@ def _reconstruct(self, codegen: PyCodeGen): def make_stringify_guard(self) -> list[StringifyExpression]: frame_value_tracer = self.tracker.trace_value_from_frame() + if ENV_SOT_ALLOW_DYNAMIC_SHAPE.get(): + str_left_expr = f"MetaInfo.from_tensor({{}}, dynamic_axes={self.meta.dynamic_axes}).guard_str()" + else: + str_left_expr = "MetaInfo.from_tensor({}).guard_str()" return [ StringifyExpression( - f"MetaInfo.from_tensor({{}}).guard_str() == '{self.origin_meta.guard_str()}'", + f"{str_left_expr} == '{self.origin_meta.guard_str()}'", [frame_value_tracer], union_free_vars( {"MetaInfo": MetaInfo}, @@ -483,15 +485,15 @@ def size(self): @tensor_property def shape(self): + # TODO(zrr1999): support more tensor properties if self.meta.is_dynamic_shape(): raise BreakGraphError( f"Getting shape for a dynamic shape tensor causes graph break. 
shape = {self.meta.shape}" ) from .container import ListVariable - return ListVariable( - self.meta.shape, self.graph, tracker=DummyTracker([self]) - ) + tracker = GetAttrTracker(self, "shape") + return ListVariable(self.meta.shape, self.graph, tracker=tracker) def numel(self): return self.size @@ -605,7 +607,7 @@ class SymbolicVariable(VariableBase): def __init__( self, - value: int | MetaInfo, + value: int | None | MetaInfo, graph: FunctionGraph, tracker: Tracker, ): @@ -663,7 +665,9 @@ def make_stringify_guard(self) -> list[StringifyExpression]: from ..executor_cache import OpcodeExecutorCache frame_value_tracer = self.tracker.trace_value_from_frame() - symbolic_inputs = OpcodeExecutorCache().symbolic_inputs + symbolic_inputs = OpcodeExecutorCache().get_symbolic_inputs( + self.graph.pycode_gen._origin_code + ) assert frame_value_tracer.inlined_expr in symbolic_inputs @@ -681,25 +685,42 @@ def make_stringify_guard(self) -> list[StringifyExpression]: ) ] + @staticmethod + def should_create_symbolic_variable( + value: Any, tracker: Tracker, symbolic_inputs: dict[str, dict[int, int]] + ): + tracker_expr = tracker.trace_value_from_frame().inlined_expr + symbolic_inputs.setdefault(tracker_expr, {}) + for expr, symbolic_input in symbolic_inputs.items(): + if tracker.match_expr(expr): + symbolic_input.setdefault(value, 0) + symbolic_input[value] += 1 + if symbolic_input[value] >= STATIC_DIM_FREQ_THRESHOLD: + return False + if len(symbolic_input.keys()) > 1: + return True + return False + return False + @VariableFactory.register_from_value(successor="ConstantVariable") def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): if not ENV_SOT_ALLOW_DYNAMIC_SHAPE.get(): - return + return None if not isinstance(value, int): - return - if not tracker.need_guard(): - return + return None + if not tracker.is_traceable(): + return None from ..executor_cache import OpcodeExecutorCache - symbolic_inputs = OpcodeExecutorCache().symbolic_inputs + symbolic_inputs = OpcodeExecutorCache().get_symbolic_inputs( + graph.pycode_gen._origin_code + ) - for tracker_expr, symbolic_input in symbolic_inputs.items(): - if tracker.match_expr(tracker_expr): - symbolic_input.setdefault(value, 0) - symbolic_input[value] += 1 - # TODO(zrr1999): determine frequency - return SymbolicVariable(value, graph, tracker) + if SymbolicVariable.should_create_symbolic_variable( + value, tracker, symbolic_inputs + ): + return SymbolicVariable(value, graph, tracker) return None diff --git a/python/paddle/jit/sot/symbolic/compile_cache.py b/python/paddle/jit/sot/symbolic/compile_cache.py index b697e721532f9..5cb06059bb3db 100644 --- a/python/paddle/jit/sot/symbolic/compile_cache.py +++ b/python/paddle/jit/sot/symbolic/compile_cache.py @@ -21,6 +21,7 @@ from paddle.amp.auto_cast import amp_state from paddle.base.data_feeder import convert_dtype from paddle.framework import _dygraph_tracer, use_pir_api +from paddle.static import InputSpec from ..infer_meta import convert_meta_to_input_spec from ..profiler import EventGuard @@ -162,7 +163,13 @@ class CompileSIRCache(Cache, metaclass=Singleton): def __init__(self): super().__init__(weak=False) - def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs): + def key_fn( + self, + context: SymbolicTraceContext, + sir_name: str, + input_spec: list[InputSpec], + **kwargs, + ): """ generate a hash key for a SIR @@ -176,10 +183,16 @@ def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs): """ sir = context.get_sir(sir_name) # NOTE(dev): Is str(sir) a heavy 
operation ?
-        hash_key = hash((str(sir), kwargs['training']))
+        hash_key = hash((str(sir), *input_spec, kwargs['training']))
         return hash_key

-    def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
+    def value_fn(
+        self,
+        context: SymbolicTraceContext,
+        sir_name: str,
+        input_spec: list[InputSpec],
+        **kwargs,
+    ):
         """
         Generate static graph function

@@ -196,6 +209,7 @@ def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
         return FallbackWrapper(
             paddle.jit.to_static(
                 compile_sir(context, sir_name),
+                input_spec=[input_spec],
                 build_strategy=build_strategy,
                 backend=backend,
                 full_graph=True,
diff --git a/python/paddle/jit/sot/symbolic/symbolic_context.py b/python/paddle/jit/sot/symbolic/symbolic_context.py
index cc6487f696d0a..4efe3038c2781 100644
--- a/python/paddle/jit/sot/symbolic/symbolic_context.py
+++ b/python/paddle/jit/sot/symbolic/symbolic_context.py
@@ -14,6 +14,10 @@

 from __future__ import annotations

+from typing import Any, Callable
+
+from paddle.static import InputSpec
+
 from ..utils import log
 from .compile_cache import CompileSIRCache
 from .statement_ir import (
@@ -126,7 +130,15 @@ def replace_TOS(self, sir):
         self.sir_stack.append(sir)
         self.statement_factory.update(sir)

-    def compile_do_nothing(self, ret_vals):
+    def return_TOS(self, ret_vals):
+        cur_sir: StatementIR = self.TOS
+        cur_sir.inputs = cur_sir.analyse_inputs()
+        cur_sir.outputs = ret_vals
+        log(2, "start subgraph compile and execution.\n")
+        log(2, self.TOS, "\n")
+        return cur_sir
+
+    def compile_do_nothing(self) -> Callable[..., Any]:
         """
         Return a dummy function, which will return an empty list.

@@ -141,29 +153,12 @@ def __call__(*args, **kwargs):
             def graph_size(self):
                 return 0

-        # return None function
-        dummy_stmt_ir = StatementIR("dummy_func")
-        dummy_stmt_ir.outputs = []
-        dummy_stmt_ir.inputs = []
-        return DummyFunc(), dummy_stmt_ir
+        return DummyFunc()

-    def compile_fn(self, ret_vals, **kwargs):
+    def compile_fn(self, sir_name: str, input_spec: list[InputSpec], **kwargs):
         """
         start compile and return the python function, which must can be
         to_static without errors.
         """
-        cur_sir: StatementIR = self.TOS
-        # step0: if no statement, return a dummy function
-        if len(cur_sir.statements) == 0:
-            return self.compile_do_nothing(ret_vals)
-        # step1: analyse sir inputs and outputs
-        cur_sir.inputs = cur_sir.analyse_inputs()
-        # TODO: output analysis
-        cur_sir.outputs = ret_vals
-        log(2, "start subgraph compile and execution.\n")
-        log(2, self.TOS, "\n")
-        # step2: call compile_sir and get python function, third cache is triggered here.
-        static_func = CompileSIRCache()(self, cur_sir.name, **kwargs)
-        # step3: GC and reset TOS
-        # self.reset_TOS()
+        static_func = CompileSIRCache()(self, sir_name, input_spec, **kwargs)

-        return static_func, cur_sir
+        return static_func
diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py
index ddf0cf9c8b02e..c281e335efb3d 100644
--- a/python/paddle/jit/translated_layer.py
+++ b/python/paddle/jit/translated_layer.py
@@ -892,8 +892,7 @@ def _run_dygraph(instance, input, program_holder):
     for i, value in enumerate(input):
         if not isinstance(value, (np.ndarray, core.eager.Tensor)):
             raise TypeError(
-                "The type of input in TranslatedLayer must be numpy array or Variable(Tensor), but received %s."
-                % type(value)
+                f"The type of input in TranslatedLayer must be numpy array or Variable(Tensor), but received {type(value)}."
            )
         # NOTE: In order to unify the API, firstly convert the input to Tensor
         if isinstance(value, np.ndarray):
@@ -925,8 +924,7 @@ def _run_dygraph(instance, input, program_holder):
             persistable_vars.append(instance._buffers[dy_var_name])
         else:
             raise ValueError(
-                "The persistable variable %s does not exist in current TranslatedLayer."
-                % var_name
+                f"The persistable variable {var_name} does not exist in current TranslatedLayer."
             )

     output_vars = []
@@ -1426,7 +1424,7 @@ def _construct(model_path, configs=None):
     # 0. dir and filename check
     model_path = os.path.normpath(model_path)
     if not os.path.isdir(model_path):
-        raise ValueError("There is no directory named '%s'" % model_path)
+        raise ValueError(f"There is no directory named '{model_path}'")
     model_filename = None
     params_filename = None
     if configs is not None:
@@ -1591,8 +1589,7 @@ def _get_program_holder(self, method_name='forward'):
         program_holder = self._program_holder_dict.get(method_name, None)
         if program_holder is None:
             raise ValueError(
-                "The method `%s` does not exist in loaded TranslatedLayer."
-                % method_name
+                f"The method `{method_name}` does not exist in loaded TranslatedLayer."
             )
         return program_holder
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index a9d8312bb4ca0..42793a6496ad3 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -69,7 +69,9 @@
     Upsample,
     UpsamplingBilinear2D,
     UpsamplingNearest2D,
+    ZeroPad1D,
     ZeroPad2D,
+    ZeroPad3D,
 )

 # TODO: import all neural network related api under this directory,
@@ -300,4 +302,6 @@
     'Unflatten',
     'FractionalMaxPool2D',
     'FractionalMaxPool3D',
+    'ZeroPad1D',
+    'ZeroPad3D',
 ]
diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py
index e281d6cd48589..270f0bb9234ea 100644
--- a/python/paddle/nn/initializer/__init__.py
+++ b/python/paddle/nn/initializer/__init__.py
@@ -18,7 +18,7 @@
     Assign,
     NumpyArrayInitializer,  # noqa: F401
 )
-from .Bilinear import Bilinear
+from .bilinear import Bilinear
 from .constant import (
     Constant,
     ConstantInitializer,  # noqa: F401
diff --git a/python/paddle/nn/initializer/Bilinear.py b/python/paddle/nn/initializer/bilinear.py
similarity index 100%
rename from python/paddle/nn/initializer/Bilinear.py
rename to python/paddle/nn/initializer/bilinear.py
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 6faf07bb6eb19..6b34c9fa90f6b 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1077,6 +1077,67 @@ def extra_repr(self):
         return f'padding={self._pad}, mode={self._mode}, value={self._value}, data_format={self._data_format}{name_str}'


+class ZeroPad1D(Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad1D`` class.
+    Pads the input tensor boundaries with zero.
+
+    Parameters:
+        padding (Tensor | List[int] | int): The padding size with data type int. If it is an int, the
+            same padding is used in all dimensions. Otherwise, [len(padding)/2] dimensions of the input will be padded.
+            The pad has the form (pad_left, pad_right).
+        data_format (str): A string from: "NCL", "NLC". Specify the data format of the input data.
+            Default is "NCL"
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x(Tensor): The input tensor of zeropad1d operator, which is a 3-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of zeropad1d operator, which is a 3-D tensor.
+          The data type is the same as input x.
+
+    Examples:
+
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> input_shape = (1, 2, 3)
+            >>> pad = [1, 2]
+            >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1
+            >>> my_pad = nn.ZeroPad1D(padding=pad)
+            >>> result = my_pad(data)
+            >>> print(result)
+            Tensor(shape=[1, 2, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[0., 1., 2., 3., 0., 0.],
+              [0., 4., 5., 6., 0., 0.]]])
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super().__init__()
+        self._pad = _npairs(padding, 1)
+        self._mode = 'constant'
+        self._value = 0.0
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
+
+    def extra_repr(self):
+        name_str = f', name={self._name}' if self._name else ''
+        return f'padding={self._pad}, data_format={self._data_format}{name_str}'
+
+
 class Pad2D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad2D`` class.
@@ -1290,6 +1351,70 @@ def extra_repr(self):
         return f'padding={self._pad}, mode={self._mode}, value={self._value}, data_format={self._data_format}{name_str}'


+class ZeroPad3D(Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad3D`` class.
+    Pads the input tensor boundaries with zero.
+
+    Parameters:
+        padding (Tensor | List[int] | int): The padding size with data type int. If it is an int, the
+            same padding is used in all dimensions. Otherwise, [len(padding)/2] dimensions of the input will be padded.
+            The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        data_format (str): A string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+            Default is "NCDHW"
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x(Tensor): The input tensor of zeropad3d operator, which is a 5-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of zeropad3d operator, which is a 5-D tensor.
+          The data type is the same as input x.
+
+    Examples:
+
+        .. 
code-block:: python + + >>> import paddle + >>> import paddle.nn as nn + + >>> input_shape = (1, 1, 1, 2, 3) + >>> pad = [1, 0, 1, 2, 0, 0] + >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 + >>> my_pad = nn.ZeroPad3D(padding=pad) + >>> result = my_pad(data) + >>> print(result) + Tensor(shape=[1, 1, 1, 5, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[[0., 0., 0., 0.], + [0., 1., 2., 3.], + [0., 4., 5., 6.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]]]]) + """ + + def __init__(self, padding, data_format="NCDHW", name=None): + super().__init__() + self._pad = _npairs(padding, 3) + self._mode = 'constant' + self._value = 0.0 + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.pad( + x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name, + ) + + def extra_repr(self): + name_str = f', name={self._name}' if self._name else '' + return f'padding={self._pad}, data_format={self._data_format}{name_str}' + + class CosineSimilarity(Layer): """ This interface is used to compute cosine similarity between x1 and x2 along axis. diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 1c2d962f720cf..41ad1839e1f8a 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle import _C_ops, version +import paddle +from paddle import _C_ops from paddle.base.data_feeder import check_dtype from paddle.base.framework import convert_np_dtype_to_dtype_ from paddle.device.cuda import get_device_capability @@ -24,7 +25,7 @@ def _get_arch_info(): # Get SMVersion from device. 
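# [Sketch; the motivation is inferred, not stated in the patch] The change
# below swaps a module attribute bound at import time for a lookup performed
# at call time:
#
#     from paddle import version          # binds the generated module once
#     cuda = version.cuda()
#
#     import paddle                       # resolves paddle.version per call
#     cuda = paddle.version.cuda()        # returns e.g. '11.8', or 'False'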
-    cuda_version = version.cuda()
+    cuda_version = paddle.version.cuda()
     if cuda_version is not None and cuda_version != 'False':
         major, minor = get_device_capability()
         arch = int(major * 10 + minor)
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 469145ac6a832..e508cbdd43c19 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -63,6 +63,7 @@
     load_pir,
     load_pir_inference_model,
     load_vars_pir,
+    normalize_pir_program,
     save_pir,
     save_pir_inference_model,
     save_vars_pir,
@@ -183,6 +184,8 @@ def normalize_program(program, feed_vars, fetch_vars, **kwargs):
             >>> normalized_program = paddle.static.normalize_program(program, [image], [predict])

    """
+    if in_pir_mode():
+        return normalize_pir_program(program, feed_vars, fetch_vars, **kwargs)
     if not isinstance(program, Program):
         raise TypeError(
             "program type must be `base.Program`, but received `%s`"
diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py
index d2252ebc0a0bc..2be2cecf18742 100644
--- a/python/paddle/static/nn/metric.py
+++ b/python/paddle/static/nn/metric.py
@@ -245,6 +245,28 @@ def auc(
             [array(1.)]

    """
+    if in_pir_mode():
+        if ins_tag_weight is None:
+            ins_tag_weight = paddle.full(
+                shape=[1, 1], dtype="float32", fill_value=1.0
+            )
+        stat_pos = paddle.zeros(shape=[1, num_thresholds + 1], dtype="int64")
+        stat_neg = paddle.zeros(shape=[1, num_thresholds + 1], dtype="int64")
+        auc_out, batch_stat_pos, batch_stat_neg = _C_ops.auc(
+            input,
+            label,
+            stat_pos,
+            stat_neg,
+            ins_tag_weight,
+            curve,
+            num_thresholds,
+            slide_steps,
+        )
+        return (
+            auc_out,
+            batch_stat_pos,
+            batch_stat_neg,
+        )

     helper = LayerHelper("auc", **locals())
     if ins_tag_weight is None:
diff --git a/python/paddle/static/pir_io.py b/python/paddle/static/pir_io.py
index 38e5e69cfdbb1..bd9b5305b7696 100644
--- a/python/paddle/static/pir_io.py
+++ b/python/paddle/static/pir_io.py
@@ -251,7 +251,13 @@ def normalize_pir_program(program, feed_vars, fetch_vars, **kwargs):
     if not all(isinstance(v, pir.Value) for v in fetch_vars):
         raise TypeError("fetch_vars type must be a Value or a list of Value.")

-    # TODO(Ruting) remind users to set auc_states to 0 if auc op were found.
+    # remind users to set auc_states to 0 if an auc op is found.
+    for op in program.global_block().ops:
+        if op.name() == 'pd_op.auc':
+            warnings.warn(
+                "Be sure that you have set auc states to 0 before saving inference model."
+            )
+            break

     # fix the bug that the activation op's output as target will be pruned.
     # will affect the inference performance.
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 4de5e392a8493..553ea2cc5bbee 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -142,6 +142,7 @@
     atleast_1d,
     atleast_2d,
     atleast_3d,
+    block_diag,
     broadcast_tensors,
     broadcast_to,
     cast,
@@ -306,6 +307,7 @@
     inner,
     inverse,
     isfinite,
+    isin,
     isinf,
     isnan,
     isneginf,
@@ -544,6 +546,7 @@
     'hypot_',
     'nansum',
     'nanmean',
+    'block_diag',
     'count_nonzero',
     'tanh',
     'tanh_',
@@ -587,6 +590,7 @@
     'kron',
     'kthvalue',
     'isfinite',
+    'isin',
     'isinf',
     'isnan',
     'isneginf',
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 24c60af7499e6..9f0d808a8b2b4 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1644,7 +1644,7 @@ def meshgrid(*args, **kwargs):
     Args:
         *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),
-            (N2,),..., (Nk,). Support data types: ``float64``, ``float16``, ``float32``, ``int32``, ``int64``.
+            (N2,),..., (Nk,). Support data types: ``float64``, ``bfloat16``, ``float16``, ``float32``, ``int32``, ``int64``, ``complex64``, ``complex128``.
         **kwargs (optional): Currently, only accept name in **kwargs
             The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
@@ -1686,7 +1686,16 @@
         check_dtype(
             input_.dtype,
             'create data type',
-            ['uint16', 'float16', 'float32', 'float64', 'int32', 'int64'],
+            [
+                'uint16',
+                'float16',
+                'float32',
+                'float64',
+                'int32',
+                'int64',
+                'complex64',
+                'complex128',
+            ],
             'meshgrid',
         )
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 9803d4a8c5c0a..2ba51595cc94d 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3971,7 +3971,7 @@ def tile(x, repeat_times, name=None):
     Both the number of dimensions of ``x`` and the number of elements in ``repeat_times`` should be less than or equal to 6.

     Args:
-        x (Tensor): The input tensor, its data type should be bool, float16, float32, float64, int32 or int64.
+        x (Tensor): The input tensor, its data type should be bool, float16, float32, float64, int32, int64, complex64 or complex128.
         repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
             should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
@@ -4038,6 +4038,8 @@ def check_input(x, repeat_times):
                 'float64',
                 'int32',
                 'int64',
+                'complex64',
+                'complex128',
             ],
             'tile',
         )
@@ -4209,7 +4211,7 @@ def expand(x, shape, name=None):
     Both the number of dimensions of ``x`` and the number of elements in ``shape`` should be less than or equal to 6. And the number of dimensions of ``x`` should be less than the number of elements in ``shape``. The dimension to expand must have a value 0.

     Args:
-        x (Tensor): The input Tensor, its data type is bool, float16, float32, float64, int32, int64, uint8 or uint16.
+        x (Tensor): The input Tensor, its data type is bool, float16, float32, float64, int32, int64, uint8, uint16, complex64 or complex128.
         shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements
             should be integers or 0-D or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32.
             The value -1 in shape means keeping the corresponding dimension unchanged.
@@ -4275,6 +4277,8 @@ def expand(x, shape, name=None):
             'int64',
             'uint8',
             'uint16',
+            'complex64',
+            'complex128',
         ],
         'expand',
     )
@@ -6861,3 +6865,67 @@ def slice_scatter(x, value, axes, starts, ends, strides, name=None):
     )

     return output
+
+
+def block_diag(inputs, name=None):
+    """
+    Create a block diagonal matrix from provided tensors.
+
+    Args:
+        inputs (list|tuple): ``inputs`` is a Tensor list or Tensor tuple, one or more tensors with 0, 1, or 2 dimensions.
+        name (str, optional): Name for the operation (optional, default is None).
+
+    Returns:
+        Tensor, A ``Tensor``. The data type is the same as ``inputs``.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+
+            >>> A = paddle.to_tensor([[4], [3], [2]])
+            >>> B = paddle.to_tensor([7, 6, 5])
+            >>> C = paddle.to_tensor(1)
+            >>> D = paddle.to_tensor([[5, 4, 3], [2, 1, 0]])
+            >>> E = paddle.to_tensor([[8, 7], [7, 8]])
+            >>> out = paddle.block_diag([A, B, C, D, E])
+            >>> print(out)
+            Tensor(shape=[9, 10], dtype=int64, place=Place(gpu:0), stop_gradient=True,
+            [[4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+             [3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+             [2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+             [0, 7, 6, 5, 0, 0, 0, 0, 0, 0],
+             [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+             [0, 0, 0, 0, 0, 5, 4, 3, 0, 0],
+             [0, 0, 0, 0, 0, 2, 1, 0, 0, 0],
+             [0, 0, 0, 0, 0, 0, 0, 0, 8, 7],
+             [0, 0, 0, 0, 0, 0, 0, 0, 7, 8]])
+    """
+
+    def to_col_block(arys, i, a):
+        return [
+            a
+            if idx == i
+            else paddle.zeros([ary.shape[0], a.shape[1]], dtype=a.dtype)
+            for idx, ary in enumerate(arys)
+        ]
+
+    def to_2d(ary):
+        if ary.ndim == 0:
+            return ary.unsqueeze(axis=0).unsqueeze(axis=0)
+        if ary.ndim == 1:
+            return ary.unsqueeze(axis=0)
+        if ary.ndim == 2:
+            return ary
+        raise ValueError(
+            "For 'block_diag', the dimension of each element in 'inputs' must be 0, 1, or 2, but got "
+            f"{ary.ndim}"
+        )
+
+    arys = [to_2d(ary) for ary in inputs]
+
+    matrix = [
+        paddle.concat(to_col_block(arys, idx, ary), axis=0)
+        for idx, ary in enumerate(arys)
+    ]
+    return paddle.concat(matrix, axis=1)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index d7d8669ff0c3b..3df4cf88c94b6 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -2726,7 +2726,7 @@ def inverse(x, name=None):
         x (Tensor): The input tensor. The last two
             dimensions should be equal. When the number of dimensions is
             greater than 2, it is treated as batches of square matrix. The data
-            type can be float32 and float64.
+            type can be float32, float64, complex64, complex128.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.

     Returns:
@@ -2751,7 +2751,12 @@
     else:

         def _check_input(x):
-            check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'inverse')
+            check_variable_and_dtype(
+                x,
+                'x',
+                ['float32', 'float64', 'complex64', 'complex128'],
+                'inverse',
+            )
             if len(x.shape) < 2:
                 raise ValueError(
                     "The input of inverse is expected to be a Tensor whose number "
@@ -7969,3 +7974,187 @@ def sinc_(x, name=None):
     paddle.sin_(x)
     paddle.divide_(x, tmp)
     return paddle.where(~paddle.isnan(x), x, paddle.full_like(x, 1.0))
+
+
+def isin(x, test_x, assume_unique=False, invert=False, name=None):
+    r"""
+    Tests if each element of `x` is in `test_x`.
+
+    Args:
+        x (Tensor): The input Tensor. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
+        test_x (Tensor): Tensor values against which to test for each input element. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
+        assume_unique (bool, optional): If True, indicates both `x` and `test_x` contain unique elements, which could make the calculation faster. Default: False.
+        invert (bool, optional): Indicate whether to invert the boolean return tensor. If True, invert the results. Default: False.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor), The output Tensor with the same shape as `x`.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> paddle.set_device('cpu')
+            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
+            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
+            >>> res = paddle.isin(x, test_x)
+            >>> print(res)
+            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, True, True, False, True])
+
+            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
+            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
+            >>> res = paddle.isin(x, test_x, invert=True)
+            >>> print(res)
+            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True, False, False, True, False])
+
+            >>> # Set `assume_unique` to True only when `x` and `test_x` contain unique values, otherwise the result may be incorrect.
+            >>> x = paddle.to_tensor([0., 1., 2.]*20).reshape([20, 3])
+            >>> test_x = paddle.to_tensor([0., 1.]*20)
+            >>> correct_result = paddle.isin(x, test_x, assume_unique=False)
+            >>> print(correct_result)
+            Tensor(shape=[20, 3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False]])
+
+            >>> incorrect_result = paddle.isin(x, test_x, assume_unique=True)
+            >>> print(incorrect_result)
+            Tensor(shape=[20, 3], dtype=bool, place=Place(gpu:0), stop_gradient=True,
+            [[True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , False]])
+
+    """
+    if not isinstance(x, (paddle.Tensor, Variable, paddle.pir.Value)):
+        raise TypeError(f"x must be tensor type, but got {type(x)}")
+    if not isinstance(test_x, (paddle.Tensor, Variable, paddle.pir.Value)):
+        raise TypeError(f"test_x must be tensor type, but got {type(test_x)}")
+
+    check_variable_and_dtype(
+        x,
+        "x",
+        [
+            'uint16',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+        ],
+        "isin",
+    )
+
+    check_variable_and_dtype(
+        test_x,
+        "test_x",
+        [
+            'uint16',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+        ],
+        "isin",
+    )
+
+    x_zero_dim = False
+    if len(x.shape) == 0:
+        x = x.reshape([1])
+        x_zero_dim = True
+
+    size_x = math.prod(x.shape)
+    size_t = math.prod(test_x.shape)
+    if size_t < math.pow(size_x, 0.145) * 10.0:
+        # use brute-force searching if the test_x size is small
+        if len(x.shape) == 0:
+            return paddle.zeros([], dtype='bool')
+
+        tmp = x.reshape(tuple(x.shape) + ((1,) * test_x.ndim))
+        cmp = tmp == test_x
+        dim = tuple(range(-1, -test_x.ndim - 1, -1))
+        cmp = cmp.any(axis=dim)
+        if invert:
+            cmp = ~cmp
+    else:
+        x_flat = x.flatten()
+        test_x_flat = test_x.flatten()
+        if assume_unique:
+            # if x and test_x both contain unique elements, use stable argsort method which could be faster
+            all_elements = paddle.concat([x_flat, test_x_flat])
+            sorted_index = paddle.argsort(all_elements, stable=True)
+            sorted_x = all_elements[sorted_index]
+
+            duplicate_mask = paddle.full_like(sorted_index, False, dtype='bool')
+            if not in_dynamic_mode():
+                duplicate_mask = paddle.static.setitem(
+                    duplicate_mask,
+                    paddle.arange(duplicate_mask.numel() - 1),
+                    sorted_x[1:] == sorted_x[:-1],
+                )
+            else:
+                duplicate_mask[:-1] = sorted_x[1:] == sorted_x[:-1]
+
+            if invert:
+                duplicate_mask = duplicate_mask.logical_not()
+
+            mask = paddle.empty_like(duplicate_mask)
+            if not in_dynamic_or_pir_mode():
+                mask = paddle.static.setitem(mask, sorted_index, duplicate_mask)
+            else:
+                mask[sorted_index] = duplicate_mask
+
+            cmp = mask[0 : x.numel()].reshape(x.shape)
+        else:
+            # otherwise use searchsorted method
+            sorted_test_x = paddle.sort(test_x_flat)
+            idx = paddle.searchsorted(sorted_test_x, x_flat)
+            test_idx = paddle.where(
+                idx < sorted_test_x.numel(),
+                idx,
+                paddle.zeros_like(idx, 'int64'),
+            )
+            cmp = sorted_test_x[test_idx] == x_flat
+            cmp = cmp.logical_not() if invert else cmp
+            cmp = cmp.reshape(x.shape)
+
+    if x_zero_dim:
+        return cmp.reshape([])
+    else:
+        return cmp
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 736ae891f2fb8..9ec4cd1e2ec7f 100755
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -130,6 +130,7 @@ def argsort(x, axis=-1, descending=False, stable=False, name=None):
         x,
         'x',
         [
+            'uint16',
             'float16',
             'float32',
             'float64',
diff --git a/python/paddle/tensor/tensor.prototype.pyi b/python/paddle/tensor/tensor.prototype.pyi
index 735c8da282545..9b011b602b5e3 100644
--- a/python/paddle/tensor/tensor.prototype.pyi
+++ b/python/paddle/tensor/tensor.prototype.pyi
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# The `Tensor` template for `tools/gen_tensor_stub.py` generates the stub file `tensor.pyi`.
-# Add docstring, attributes, methods and alias with type annotaions for `Tensor`
+# The `Tensor` template `tensor.prototype.pyi` is used by `tools/gen_tensor_stub.py` to generate the stub file `tensor.pyi`.
+# Add docstring, attributes, methods and alias with type annotations for `Tensor` in `tensor.prototype.pyi`
 # if not conveniently coding in original place (like c++ source file).

 from typing import Any, overload
diff --git a/python/setup.py.in b/python/setup.py.in
index 67d23a089aa37..98ccf8c61e41c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1054,6 +1054,36 @@ if '${WITH_STRIP}' == 'ON':
         if os.system(command) != 0:
             raise Exception("strip *.so failed, command: %s" % command)

+
+def check_build_dependency():
+    missing_modules = '''Missing build dependency: {dependency}
+Please run 'pip install -r python/requirements.txt' to make sure you have all the dependencies installed.
+'''.strip() + + with open('${PADDLE_SOURCE_DIR}' + '/python/requirements.txt') as f: + build_dependencies = ( + f.read().splitlines() + ) # Specify the dependencies to install + + python_dependencies_module = [] + installed_packages = [] + + for dependency in build_dependencies: + python_dependencies_module.append( + re.sub("_|-", '', re.sub(r"==.*|>=.*|<=.*", '', dependency)) + ) + reqs = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']) + + for r in reqs.split(): + installed_packages.append( + re.sub("_|-", '', r.decode().split('==')[0]).lower() + ) + + for dependency in python_dependencies_module: + if dependency.lower() not in installed_packages: + raise RuntimeError(missing_modules.format(dependency=dependency)) + + def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir): """install cpp distribution and build test target @@ -1095,6 +1125,9 @@ def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir): subprocess.check_call(["cmake", "--build", paddle_lib_test_dir]) +# check build dependency +check_build_dependency() + # install cpp distribution if '${WITH_CPP_DIST}' == 'ON': paddle_install_dir = '${PADDLE_INSTALL_DIR}' @@ -1112,6 +1145,28 @@ package_data['paddle.base'] = package_data.get('paddle.base', []) + [ package_data['paddle.tensor'] = package_data.get('paddle.tensor', []) + ['tensor.pyi'] +def generate_tensor_stub(paddle_binary_dir, paddle_source_dir): + print('-'*2, 'Generate stub file tensor.pyi ... ') + script_path = paddle_source_dir + '/tools/' + sys.path.append(script_path) + import gen_tensor_stub + + gen_tensor_stub.generate_stub_file( + input_file=paddle_source_dir + + '/python/paddle/tensor/tensor.prototype.pyi', + output_file=paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + ) + + shutil.copy( + paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + paddle_source_dir + '/python/paddle/tensor/tensor.pyi', + ) + print('-'*2, 'End Generate stub file tensor.pyi ... ') + +# generate stub file `tensor.pyi` +generate_tensor_stub('${PADDLE_BINARY_DIR}', '${PADDLE_SOURCE_DIR}') + + with redirect_stdout(): setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index 597e9b9187f6c..aa68da69a9f7c 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -220,10 +220,10 @@ if platform.system() == 'Linux' and platform.machine() == 'x86_64': cuda_major_version = version.split('.')[0] except Exception as e: raise ValueError("CUDA not found") - + install_requires.append(PADDLE_CUDA_INSTALL_REQUIREMENTS[cuda_major_version].split("|")) - - + + with redirect_stdout(): setup( diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 15cf679177709..40f16161ab71e 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -19,3 +19,4 @@ wandb>=0.13 ; python_version<"3.12" xlsxwriter==3.0.9 xdoctest==1.1.1 ubelt==1.3.3 # just for xdoctest +mypy==1.10.0 diff --git a/setup.py b/setup.py index aab6fe0bcfd82..6d9ce542c6a15 100644 --- a/setup.py +++ b/setup.py @@ -1796,6 +1796,25 @@ def submodules_not_exists_or_empty(folder): sys.exit(1) +def generate_tensor_stub(paddle_binary_dir, paddle_source_dir): + print('-' * 2, 'Generate stub file tensor.pyi ... 
') + script_path = paddle_source_dir + '/tools/' + sys.path.append(script_path) + import gen_tensor_stub + + gen_tensor_stub.generate_stub_file( + input_file=paddle_source_dir + + '/python/paddle/tensor/tensor.prototype.pyi', + output_file=paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + ) + + shutil.copy( + paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + paddle_source_dir + '/python/paddle/tensor/tensor.pyi', + ) + print('-' * 2, 'End Generate stub file tensor.pyi ... ') + + def main(): # Parse the command line and check arguments before we proceed with building steps and setup parse_input_command(filter_args_list) @@ -1875,6 +1894,9 @@ def main(): package_data['paddle.libs'], ) + # generate stub file `tensor.pyi` + generate_tensor_stub(paddle_binary_dir, paddle_source_dir) + setup( name=package_name, version=paddle_version, diff --git a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py index 595f58b206193..8cf3f185dcbfc 100644 --- a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py +++ b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py @@ -13,7 +13,6 @@ # limitations under the License. import os -import unittest import paddle import paddle.distributed as dist @@ -183,14 +182,11 @@ def run_pr_to_rs_case(self): tgt_out_value = (self._mesh.process_ids, [-1, 1, -1], {}) def run_pr_to_ss_case(self): - # [Partial(), Replicate()] --> [Shard(0), Shard(1)] - # raise NotImplementedError - with unittest.TestCase().assertRaises(NotImplementedError): - self.create_program( - [self.BATCH_SIZE, self.SEQ_LEN, self.HIDDEN_SIZE], - [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()], - [dist.Shard(0), dist.Shard(1)], - ) + self.create_program( + [self.BATCH_SIZE, self.SEQ_LEN, self.HIDDEN_SIZE], + [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()], + [dist.Shard(0), dist.Shard(1)], + ) def run_ss_to_ss_case(self): # [Shard(0), Shard(1)] --> [Shard(1), Shard(0)] diff --git a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py index 47bfb9a44df06..532426208c1ee 100644 --- a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py +++ b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py @@ -102,7 +102,7 @@ def run_pp_to_rr_case(self): rank_id = dist.get_rank() if rank_id in self._mesh0.process_ids: - assert new_ops_name[-1] == "pd_op.send_v2" + assert new_ops_name[2] == "pd_op.send_v2" else: assert new_ops_name[2] == "pd_op.recv_v2" assert new_ops_name[-2] == "pd_op.c_allreduce_sum_" diff --git a/test/auto_parallel/pir/mlp_demo_3d.py b/test/auto_parallel/pir/mlp_demo_3d.py index 41ac0d25f682a..a743aa218e659 100644 --- a/test/auto_parallel/pir/mlp_demo_3d.py +++ b/test/auto_parallel/pir/mlp_demo_3d.py @@ -118,50 +118,43 @@ def test_to_static_program(self): rank = paddle.distributed.get_rank() ops = dist_program.global_block().ops op_names = [op.name() for op in ops] - if rank < 4: - std_ops = [ - 'pd_op.data', - 'builtin.parameter', - 'pd_op.data', - 'pd_op.relu', - 'pd_op.matmul', - 'pd_op.relu', - 'dist_op.reshard', - 'dist_op.reshard', - 'pd_op.relu_grad', - 'pd_op.matmul_grad', - 'dist_op.reshard', - 'dist_op.reshard', - 'pd_op.relu_grad', - 'pd_op.sgd_', - ] - else: - std_ops = [ - 'pd_op.data', - 'builtin.parameter', - 'pd_op.data', - 'dist_op.reshard', - 'pd_op.matmul', - 'dist_op.reshard', - 'pd_op.relu', - 'pd_op.subtract', - 'pd_op.square', - 'pd_op.mean', - 
'builtin.shadow_output', - 'pd_op.full', - 'pd_op.full_like', - 'dist_op.reshard', - 'pd_op.mean_grad', - 'dist_op.reshard', - 'pd_op.square_grad', - 'pd_op.subtract_grad', - 'pd_op.relu_grad', - 'pd_op.matmul_grad', - 'dist_op.reshard', - 'dist_op.reshard', - 'pd_op.sgd_', - ] - + std_ops = [ + 'pd_op.data', + 'pd_op.data', + 'builtin.parameter', + 'builtin.parameter', + 'pd_op.data', + 'pd_op.data', + 'pd_op.relu', + 'pd_op.matmul', + 'pd_op.relu', + 'dist_op.reshard', + 'pd_op.matmul', + 'dist_op.reshard', + 'pd_op.relu', + 'pd_op.subtract', + 'pd_op.square', + 'pd_op.mean', + 'builtin.shadow_output', + 'pd_op.full', + 'pd_op.full_like', + 'dist_op.reshard', + 'pd_op.mean_grad', + 'dist_op.reshard', + 'pd_op.square_grad', + 'pd_op.subtract_grad', + 'pd_op.relu_grad', + 'pd_op.matmul_grad', + 'dist_op.reshard', + 'dist_op.reshard', + 'pd_op.relu_grad', + 'pd_op.matmul_grad', + 'dist_op.reshard', + 'dist_op.reshard', + 'pd_op.relu_grad', + 'pd_op.sgd_', + 'pd_op.sgd_', + ] assert op_names == std_ops def test_loss_value(self): diff --git a/test/auto_parallel/pir/pir_reshard_s_to_r.py b/test/auto_parallel/pir/pir_reshard_s_to_r.py index 933eb855730ea..1d4afcddf0d64 100644 --- a/test/auto_parallel/pir/pir_reshard_s_to_r.py +++ b/test/auto_parallel/pir/pir_reshard_s_to_r.py @@ -81,7 +81,7 @@ def run_pir_test_case(self): std_ops, ) elif self._shard == 1: - np.testing.assert_equal(main_program.num_ops(), 10) + np.testing.assert_equal(main_program.num_ops(), 8) std_ops = [ 'builtin.parameter', 'pd_op.data', @@ -89,9 +89,7 @@ def run_pir_test_case(self): 'pd_op.c_allgather', 'pd_op.full', 'pd_op.split_with_num', - 'builtin.split', 'pd_op.full', - 'builtin.combine', 'pd_op.concat', ] diff --git a/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py b/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py index 771fbf29491ba..6b2fab19e2dab 100644 --- a/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py +++ b/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py @@ -65,12 +65,14 @@ def run_pir_test_case(self): ops = [op.name() for op in main_program.global_block().ops] if self._shard == 0: if paddle.distributed.get_rank() == 0: - np.testing.assert_equal(main_program.num_ops(), 4) + np.testing.assert_equal(main_program.num_ops(), 6) std_ops = [ 'builtin.parameter', 'pd_op.data', 'dist_op.shard_tensor', 'pd_op.send_v2', + 'dist_op.reshard', + 'pd_op.c_allgather', ] np.testing.assert_equal( ops, @@ -91,19 +93,25 @@ def run_pir_test_case(self): ) elif self._shard == 1: if paddle.distributed.get_rank() == 0: - np.testing.assert_equal(main_program.num_ops(), 4) + np.testing.assert_equal(main_program.num_ops(), 10) std_ops = [ 'builtin.parameter', 'pd_op.data', 'dist_op.shard_tensor', 'pd_op.send_v2', + 'dist_op.reshard', + 'pd_op.c_allgather', + 'pd_op.full', + 'pd_op.split_with_num', + 'pd_op.full', + 'pd_op.concat', ] np.testing.assert_equal( ops, std_ops, ) elif paddle.distributed.get_rank() == 1: - np.testing.assert_equal(main_program.num_ops(), 11) + np.testing.assert_equal(main_program.num_ops(), 9) std_ops = [ 'builtin.parameter', 'pd_op.data', @@ -112,9 +120,7 @@ def run_pir_test_case(self): 'pd_op.c_allgather', 'pd_op.full', 'pd_op.split_with_num', - 'builtin.split', 'pd_op.full', - 'builtin.combine', 'pd_op.concat', ] diff --git a/test/auto_parallel/reshard_p_to_r_cross_mesh.py b/test/auto_parallel/reshard_p_to_r_cross_mesh.py index 6960530bf3bb3..605a245cd19db 100644 --- a/test/auto_parallel/reshard_p_to_r_cross_mesh.py +++ b/test/auto_parallel/reshard_p_to_r_cross_mesh.py @@ -90,12 
+90,14 @@ def run_pir_static_test_case(self): ops = [op.name() for op in main_program.global_block().ops] if paddle.distributed.get_rank() == 0: - np.testing.assert_equal(main_program.num_ops(), 4) + np.testing.assert_equal(main_program.num_ops(), 6) std_ops = [ 'builtin.parameter', 'pd_op.data', 'dist_op.shard_tensor', 'pd_op.send_v2', + 'dist_op.reshard', + 'pd_op.c_allreduce_sum_', ] else: np.testing.assert_equal(main_program.num_ops(), 5) diff --git a/test/auto_parallel/spmd_rules/test_flatten_rule.py b/test/auto_parallel/spmd_rules/test_flatten_rule.py index 599b2ddf4bf95..9a9ae6b921842 100644 --- a/test/auto_parallel/spmd_rules/test_flatten_rule.py +++ b/test/auto_parallel/spmd_rules/test_flatten_rule.py @@ -38,7 +38,7 @@ def setUp(self): def test_flatten_infer_forward(self): # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] - # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1] [ 0, -1, 1] + # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1], ([0, -1, 1], [-1, 0, -1, -1, 1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = 2 @@ -51,14 +51,17 @@ def test_flatten_infer_forward(self): infered_output_dist_attrs = result_dist_attrs[1] self.assertEqual(len(infered_input_dist_attrs), 1) - self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 2) self.assertEqual( infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, 1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, 1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 0, -1, -1, 1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] - # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] [ -1, 0, 1] + # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] ([ -1, 0, 1], [-1, -1, 0, -1, 1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = 2 @@ -74,9 +77,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, 1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0, 1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, 0, -1, 1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] - # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] [ -1, -1, 0] + # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] ([ -1, -1, 0], [-1, -1, -1, -1, 0] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 1, 0]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = 2 @@ -92,9 +98,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, 0] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1, 0]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, 0] + ) # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] - # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] [ -1] + # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] ([ -1], [-1, -1, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1]) self.attrs['start_axis'] = 0 self.attrs['stop_axis'] = -1 @@ -110,9 +119,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] - # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, -1] [ 0] + # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 
-1] ([ 0], [-1, 0, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1]) self.attrs['start_axis'] = 0 self.attrs['stop_axis'] = -1 @@ -128,9 +140,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 0, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] - # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] [ 1] + # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] ([ 1], [-1, 1, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([1, 0, -1, -1]) self.attrs['start_axis'] = 0 self.attrs['stop_axis'] = -1 @@ -146,9 +161,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 1, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] - # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] [-1, -1] + # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] ([-1, -1], [-1, -1, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = -1 @@ -164,9 +182,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] - # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] [-1, 0] + # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] ([-1, 0], [-1, -1, 0, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = -1 @@ -182,9 +203,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, 0, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] - # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] [0, 1] + # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] ([0, 1], [-1, 0, 1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = -1 @@ -200,6 +224,9 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 0, 1, -1, -1] + ) def test_flatten_infer_backward(self): process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index 0ae0b8ed3eaf1..d908261840cfc 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -1853,6 +1853,70 @@ TEST(CumSumGradInferSpmd, Ctor) { std::vector<int64_t>({-1, -1, -1})); } +TEST(Flatten, Ctor) { + std::vector<int64_t> mesh_shape = {2, 2}; + std::vector<int64_t> process_ids = {0, 1, 2, 3}; + std::vector<std::string> dim_names = {"x", "y"}; + ProcessMesh process_mesh(mesh_shape, process_ids, dim_names); + + auto build_input = [&](const std::vector<int64_t>& shape, + const std::vector<int64_t>& 
dim_mapping) { + auto t_dist_attr = TensorDistAttr(); + t_dist_attr.set_process_mesh(process_mesh); + t_dist_attr.set_dims_mapping(dim_mapping); + t_dist_attr.set_dynamic_dims(std::vector<bool>(shape.size(), false)); + auto input = + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); + return input; + }; + + // [b, h/ph, w/pw, c, ph, pw]; dp + auto input1 = build_input({4, 16, 16, 4, 2, 2}, {0, -1, -1, -1, -1, -1}); + // [b, h/ph, w/pw, c, ph, pw] => [b, h/ph, w/pw, hidden_size] + auto spmd1 = FlattenInferSpmd(input1, -3, -1); + EXPECT_EQ(spmd1.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd1.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd1.first[0], {0, -1, -1, -1, -1, -1}); + check_dim_mapping(spmd1.second[0], {0, -1, -1, -1}); + check_dim_mapping(spmd1.second[1], {-1, 0, -1, -1, -1, -1, -1}); // x_shape + + // [b, h/ph, w/pw, c, ph, pw]; dp, mp + auto input2 = build_input({4, 16, 16, 4, 2, 2}, {-1, 0, -1, 1, -1, -1}); + auto spmd2 = FlattenInferSpmd(input2, 1, 4); + EXPECT_EQ(spmd2.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd2.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd2.first[0], {-1, 0, -1, -1, -1, -1}); + check_dim_mapping(spmd2.second[0], {-1, 0, -1}); + check_dim_mapping(spmd2.second[1], {-1, -1, 0, -1, -1, -1, -1}); // x_shape + + // [b, s, nh, h/nh]; dp , mp + auto input3 = build_input({2, 1024, 32, 32}, {0, -1, 1, -1}); + // [b, s, nh, h/nh] => [b, s, h] + auto spmd3 = FlattenInferSpmd(input3, 2, 3); + EXPECT_EQ(spmd3.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd3.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd3.first[0], {0, -1, 1, -1}); + check_dim_mapping(spmd3.second[0], {0, -1, 1}); + check_dim_mapping(spmd3.second[1], {-1, 0, -1, 1, -1}); // x_shape + + // [b, c, d, h, w]; dp, mp + auto input4 = build_input({4, 16, 16, 4, 16}, {-1, -1, 0, 1, -1}); + auto spmd4 = FlattenInferSpmd(input4, 1, 4); + EXPECT_EQ(spmd4.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd4.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd4.first[0], {-1, -1, -1, -1, -1}); + check_dim_mapping(spmd4.second[0], {-1, -1}); + check_dim_mapping(spmd4.second[1], {-1, -1, -1, -1, -1, -1}); // x_shape + + auto out_grad = build_input({2, 1024, 1024}, {-1, -1, -1}); + auto xshape = build_input({0, 2, 1024, 4, 1024 / 4}, {-1, 0, 1, -1, -1}); + auto spmd_grad = FlattenGradInferSpmd(xshape, out_grad); + EXPECT_EQ(spmd_grad.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd_grad.second.size(), static_cast<size_t>(1)); + check_dim_mapping(spmd_grad.first[0], {0, 1, -1}); + check_dim_mapping(spmd_grad.second[0], {0, 1, -1, -1}); +} + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/test/cpp/pir/cinn/compilation_task_test.cc b/test/cpp/pir/cinn/compilation_task_test.cc index 254ab7c4baf8a..3fbe4ed4ba60b 100644 --- a/test/cpp/pir/cinn/compilation_task_test.cc +++ b/test/cpp/pir/cinn/compilation_task_test.cc @@ -24,6 +24,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/framework/pir/compilation_task.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/utils/data_util.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" @@ -34,6 +35,7 @@ PD_DECLARE_bool(cinn_bucket_compile); +using cinn::hlir::framework::pir::CompatibleInfo; using 
cinn::hlir::framework::pir::OpLoweringGroup; using cinn::hlir::framework::pir::OpLoweringGroupPtr; @@ -50,8 +52,11 @@ ProgramInfo BuildProgram(std::vector<int64_t> input_shape) { input_shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace()); std::vector<OpLoweringGroupPtr> groups; + const std::string fn_name = CompatibleInfo::GroupOpsName( + std::initializer_list<::pir::Operation*>({full_op_x.operation()})); groups.emplace_back(std::make_shared<OpLoweringGroup>( - std::initializer_list<::pir::Operation*>({full_op_x.operation()}))); + std::initializer_list<::pir::Operation*>({full_op_x.operation()}), + fn_name)); groups.back()->mut_output_ops().insert(full_op_x.operation()); return {program, groups}; diff --git a/test/cpp/pir/cinn/file_tile_config_test.cc b/test/cpp/pir/cinn/file_tile_config_test.cc index 3cdcc7a390bbe..d863baca924f7 100644 --- a/test/cpp/pir/cinn/file_tile_config_test.cc +++ b/test/cpp/pir/cinn/file_tile_config_test.cc @@ -39,7 +39,7 @@ TEST(ConfigSearcher, TestReduceDemo) { constexpr int kMaxThreadsPerBlock = 1024; // Step 1: Construct iter space and tile config. - cinn::ir::search::IterSpace iter_space; + cinn::ir::BucketInfo bucket_info; int s_dimension_lower = 32; int s_dimension_upper = 128; auto s_dimension_type = "S"; @@ -49,61 +49,52 @@ TEST(ConfigSearcher, TestReduceDemo) { auto r_dimension_type = "R"; auto r_dimension_is_dynamic = true; - iter_space.space.push_back(cinn::ir::search::IterSpace::Dimension{ - s_dimension_lower, - s_dimension_upper, - s_dimension_type, - s_dimension_is_dynamic, - std::vector<double>(128 - 32, 1.0)}); - iter_space.space.push_back( - cinn::ir::search::IterSpace::Dimension{r_dimension_lower, - r_dimension_upper, - r_dimension_type, - r_dimension_is_dynamic, - std::vector<double>(1, 1.0)}); - cinn::ir::BucketInfo bucket_info; - bucket_info.sp_lower_bound = iter_space.space[0].lower_bound; - bucket_info.sp_upper_bound = iter_space.space[0].upper_bound; - bucket_info.rb_lower_bound = iter_space.space[1].lower_bound; - bucket_info.rb_upper_bound = iter_space.space[1].upper_bound; + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{s_dimension_lower, + s_dimension_upper, + s_dimension_type, + s_dimension_is_dynamic, + std::vector<double>(128 - 32, 1.0)}); + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{r_dimension_lower, + r_dimension_upper, + r_dimension_type, + r_dimension_is_dynamic, + std::vector<double>(1, 1.0)}); + cinn::ir::ScheduleConfig::TileConfig tile_config; tile_config.spatial_inner_num = 32; tile_config.warp_num = 32; tile_config.tree_reduce_num = 128; std::vector<std::pair<std::string, std::string>> iter_space_type = { - std::make_pair("R", "dynamic"), std::make_pair("S", "dynamic")}; + std::make_pair("S", "dynamic"), std::make_pair("R", "dynamic")}; // Step 2: Add to json/Read from json cinn::ir::FileTileConfigDatabase file_database; - file_database.AddConfig(cinn::common::DefaultTarget(), - iter_space_type, - bucket_info, - tile_config, - 2); + file_database.AddConfig( + cinn::common::DefaultTarget(), bucket_info, tile_config, 2); cinn::ir::TileConfigMap tile_config_map = file_database.GetConfigs(cinn::common::DefaultTarget(), iter_space_type); for (auto& it : tile_config_map) { - LOG(INFO) << "sp_lower_bound is " << it.first.sp_lower_bound; - LOG(INFO) << "sp_upper_bound is " << it.first.sp_upper_bound; - LOG(INFO) << "rb_lower_bound is " << it.first.rb_lower_bound; - LOG(INFO) << "rb_upper_bound is " << it.first.rb_upper_bound; + LOG(INFO) << "bucket info is: "; + auto dims = 
it.first.space.size(); + for (int i = 0; i < dims; i++) { + LOG(INFO) << "Dimension " << i + << "'s lower_bound is: " << it.first.space[i].lower_bound; + LOG(INFO) << "Dimension " << i + << "'s upper_bound is: " << it.first.space[i].upper_bound; + auto dimension_lower = i == 0 ? s_dimension_lower : r_dimension_lower; + auto dimension_upper = i == 0 ? s_dimension_upper : r_dimension_upper; + PADDLE_ENFORCE_EQ(it.first.space[i].lower_bound, + dimension_lower, + ::common::errors::InvalidArgument( + "GetConfigs function gets wrong dimension_lower")); + PADDLE_ENFORCE_EQ(it.first.space[i].upper_bound, + dimension_upper, + ::common::errors::InvalidArgument( + "GetConfigs function gets wrong dimension_upper")); + } LOG(INFO) << "tile config is " << it.second.spatial_inner_num << " " << it.second.warp_num << " " << it.second.tree_reduce_num; - PADDLE_ENFORCE_EQ(it.first.sp_lower_bound, - s_dimension_lower, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong s_dimension_lower")); - PADDLE_ENFORCE_EQ(it.first.sp_upper_bound, - s_dimension_upper, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong s_dimension_upper")); - PADDLE_ENFORCE_EQ(it.first.rb_lower_bound, - r_dimension_lower, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong r_dimension_lower")); - PADDLE_ENFORCE_EQ(it.first.rb_upper_bound, - r_dimension_upper, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong r_dimension_upprt")); PADDLE_ENFORCE_EQ(it.second.spatial_inner_num, tile_config.spatial_inner_num, ::common::errors::InvalidArgument( diff --git a/test/cpp/pir/cinn/pir_compiler_test.cc b/test/cpp/pir/cinn/pir_compiler_test.cc index 8e2df8e02ac8c..622a4fec701f1 100644 --- a/test/cpp/pir/cinn/pir_compiler_test.cc +++ b/test/cpp/pir/cinn/pir_compiler_test.cc @@ -25,6 +25,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/utils/data_util.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" @@ -38,6 +39,7 @@ #include "paddle/pir/include/core/program.h" #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" +using cinn::hlir::framework::pir::CompatibleInfo; using cinn::hlir::framework::pir::OpLoweringGroup; using cinn::hlir::framework::pir::OpLoweringGroupPtr; @@ -74,18 +76,26 @@ ProgramInfo BuildProgram() { builder.Build<pir::YieldOp>(std::vector<pir::Value>{relu_op_y.result(0)}); std::vector<OpLoweringGroupPtr> groups; + const auto full_op_x_ops = + std::initializer_list<::pir::Operation*>({full_op_x.operation()}); groups.emplace_back(std::make_shared<OpLoweringGroup>( - std::initializer_list<::pir::Operation*>( - {full_op_x.operation()}))); // For coverage + full_op_x_ops, + CompatibleInfo::GroupOpsName(full_op_x_ops))); // For coverage groups[0]->mut_output_values().push_back(groups[0]->ops().back()->result(0)); + + const auto full_op_y_ops = + std::initializer_list<::pir::Operation*>({full_op_y.operation()}); groups.emplace_back(std::make_shared<OpLoweringGroup>( - std::initializer_list<::pir::Operation*>({full_op_y.operation()}))); + full_op_y_ops, CompatibleInfo::GroupOpsName(full_op_y_ops))); + groups[1]->mut_output_values().push_back(groups[1]->ops().back()->result(0)); - groups.emplace_back(std::make_shared<OpLoweringGroup>( + const auto vector_ops = 
std::vector<::pir::Operation*>({tan_op_x.operation(), relu_op_x.operation(), tan_op_y.operation(), - relu_op_y.operation()}))); + relu_op_y.operation()}); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + vector_ops, CompatibleInfo::GroupOpsName(vector_ops))); groups[2]->mut_output_values().push_back(groups[2]->ops().back()->result(0)); return {program, groups}; @@ -127,14 +137,16 @@ ProgramInfo BuildSoftmax() { auto yield_op = builder.Build<pir::YieldOp>(std::vector<pir::Value>{divide}); std::vector<OpLoweringGroupPtr> groups; - groups.emplace_back(std::make_shared<OpLoweringGroup>( + const auto vector_ops = std::initializer_list<::pir::Operation*>({max.defining_op(), broadcast_1.defining_op(), sub.defining_op(), exp.defining_op(), sum.defining_op(), broadcast_2.defining_op(), - divide.defining_op()}))); + divide.defining_op()}); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + vector_ops, CompatibleInfo::GroupOpsName(vector_ops))); groups[0]->mut_output_values().push_back(groups[0]->ops().back()->result(0)); groups[0]->set_op_pattern_kind(cinn::hlir::framework::kReduction); diff --git a/test/cpp/pir/cinn/symbolic_lower_test.cc b/test/cpp/pir/cinn/symbolic_lower_test.cc index 83de069dd622e..0c748d9b96da8 100644 --- a/test/cpp/pir/cinn/symbolic_lower_test.cc +++ b/test/cpp/pir/cinn/symbolic_lower_test.cc @@ -24,6 +24,7 @@ #include "paddle/cinn/hlir/framework/pir/group.h" #include "paddle/cinn/hlir/framework/pir/op_lowering_group.h" #include "paddle/cinn/hlir/framework/pir/op_lowering_impl.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/common/ddim.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" @@ -39,6 +40,7 @@ PD_DECLARE_bool(cinn_bucket_compile); +using cinn::hlir::framework::pir::CompatibleInfo; using cinn::hlir::framework::pir::OpLoweringGroup; using cinn::hlir::framework::pir::OpLoweringGroupPtr; @@ -88,9 +90,11 @@ BuildGroupProgramForLowering() { builder.Build<paddle::dialect::FetchOp>(group_op->result(0), "out", 0); std::vector<OpLoweringGroupPtr> groups; - groups.emplace_back( - std::make_shared<OpLoweringGroup>(std::vector<::pir::Operation*>( - {exp.operation(), reshape.operation(), sub.operation()}))); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + std::vector<::pir::Operation*>( + {exp.operation(), reshape.operation(), sub.operation()}), + CompatibleInfo::GroupOpsName(std::vector<::pir::Operation*>( + {exp.operation(), reshape.operation(), sub.operation()})))); groups[0]->mut_output_ops().insert(groups[0]->ops().back()); std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> value_to_shape_data; @@ -176,9 +180,11 @@ BuildBroadcastGroupProgramForLowering() { builder.Build<paddle::dialect::FetchOp>(group_op->result(0), "out", 0); std::vector<OpLoweringGroupPtr> groups; - groups.emplace_back( - std::make_shared<OpLoweringGroup>(std::vector<::pir::Operation*>( - {x_broadcast.operation(), sub.operation()}))); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + std::vector<::pir::Operation*>( + {x_broadcast.operation(), sub.operation()}), + CompatibleInfo::GroupOpsName(std::vector<::pir::Operation*>( + {x_broadcast.operation(), sub.operation()})))); groups[0]->mut_output_ops().insert(groups[0]->ops().back()); std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> diff --git a/test/cpp/pir/cinn/tile_config_searcher_test.cc b/test/cpp/pir/cinn/tile_config_searcher_test.cc index f54aa848b655a..289113a96bbab 100644 --- 
a/test/cpp/pir/cinn/tile_config_searcher_test.cc +++ b/test/cpp/pir/cinn/tile_config_searcher_test.cc @@ -66,22 +66,22 @@ TEST(ConfigSearcher, TestReduceDemo) { schedule_config_manager.SetPolicy("custom"); // Step 3: Construct iter space and objective function. - cinn::ir::search::IterSpace iter_space; - iter_space.space.push_back(cinn::ir::search::IterSpace::Dimension{ - 33, - 128, - "S", - /* is_dynamic = */ true, - std::vector<double>(128 - 32, 1.0)}); - iter_space.space.push_back( - cinn::ir::search::IterSpace::Dimension{1024, - 1024, - "R", - /* is_dynamic = */ false, - std::vector<double>(1, 1.0)}); + cinn::ir::BucketInfo bucket_info; + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{33, + 128, + "S", + /* is_dynamic = */ true, + std::vector<double>(128 - 32, 1.0)}); + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{1024, + 1024, + "R", + /* is_dynamic = */ false, + std::vector<double>(1, 1.0)}); std::unique_ptr<cinn::ir::search::BaseObjectiveFunc> obj_func = std::make_unique<cinn::ir::search::WeightedSamplingTrailObjectiveFunc>( - program.get(), iter_space); + program.get(), bucket_info); // Step 4: Construct config candidate range and constraints. std::vector<std::pair<int, int>> candidate_range{ diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc index 9ec1928ef10ff..a7674d60451cd 100644 --- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc @@ -426,7 +426,7 @@ TEST(pattern_rewrite, Patterns) { // true)); CHECK_EQ(pm.Run(&program), true); - EXPECT_EQ(program.block()->size(), 19u); + EXPECT_EQ(program.block()->size(), 17u); } void BuildConstantFoldingProgram(pir::Program *program, diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt index 18891bc1cb65e..12d4734020e2c 100644 --- a/test/deprecated/legacy_test/CMakeLists.txt +++ b/test/deprecated/legacy_test/CMakeLists.txt @@ -130,7 +130,6 @@ endif() if(WIN32) list(REMOVE_ITEM TEST_OPS test_complex_matmul) - list(REMOVE_ITEM TEST_OPS test_ops_nms) list(REMOVE_ITEM TEST_OPS test_trt_convert_preln_residual_bias) list(REMOVE_ITEM TEST_OPS test_masked_multihead_attention_op) list(REMOVE_ITEM TEST_OPS test_fused_ec_moe_op) @@ -401,8 +400,6 @@ endfunction() list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type) list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array) list(REMOVE_ITEM TEST_OPS test_data_norm_op) -list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) -list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_layers_deprecated) @@ -452,8 +449,7 @@ endif() # Some ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test -set(TEST_OPS_WITH_GC test_affine_channel_op test_gather_nd_op test_scatter_op - test_slice_op) +set(TEST_OPS_WITH_GC test_gather_nd_op test_slice_op) foreach(TEST_OP ${TEST_OPS_WITH_GC}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -485,10 +481,6 @@ set_tests_properties(test_logcumsumexp_op PROPERTIES TIMEOUT 30) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS - ${GC_ENVS}) -py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS - ${GC_ENVS}) 
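# Note: py_test_modules(<target> MODULES <module> [ENVS <VAR=value> ...]) is the
# CMake helper this file uses to register a Python unittest module as a ctest
# target with extra environment variables, as in the test_imperative_mnist
# registration just below. A sketch with a hypothetical module and flag name:
#   py_test_modules(test_foo MODULES test_foo ENVS FLAGS_some_flag=true)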
py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules( @@ -511,8 +503,6 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND)) test_fused_dot_product_attention_op) endif() -set_tests_properties(test_conv2d_op_depthwise_conv - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_api_deprecated PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -644,17 +634,10 @@ endif() set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) -set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_star_gan_with_gradient_penalty - PROPERTIES TIMEOUT 120) -set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200) -set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) -set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150) if(NOT WIN32) if(WITH_NV_JETSON) @@ -666,75 +649,45 @@ set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 250) -set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120) -if(WIN32) - set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900) -else() - set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600) -endif() if(WITH_NV_JETSON) set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500) - set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500) else() set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 250) - set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) endif() set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 200) -set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120) set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120) set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT 120) -set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ptb_rnn PROPERTIES TIMEOUT 120) -set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) -set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) -set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_masked_select_op PROPERTIES TIMEOUT 120) 
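# Note: ctest interprets the TIMEOUT property as a per-test limit in seconds,
# and the LABELS "RUN_TYPE=EXCLUSIVE" tags set in this file can be used to
# select or skip those tests via ctest's label filters, e.g. (illustrative):
#   ctest -L "RUN_TYPE=EXCLUSIVE"    # run only the exclusive tests
#   ctest -LE "RUN_TYPE=EXCLUSIVE"   # run everything except them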
set_tests_properties(test_sigmoid_cross_entropy_with_logits_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150) -set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_generator_dataloader_deprecated PROPERTIES TIMEOUT 120) -set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 500) set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 220) set_tests_properties(test_program_prune_backward PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 250) -set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120) set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_api_deprecated PROPERTIES TIMEOUT 120) -set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) -set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) -set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_slice_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_dataloader_keep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) -set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) -set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60) -set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60) set_tests_properties(test_uniform_random_op_deprecated PROPERTIES TIMEOUT 60) set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 120) @@ -756,36 +709,17 @@ set_tests_properties(test_inplace_addto_strategy_deprecated PROPERTIES TIMEOUT set(TEST_CINN_OPS test_softmax_op - test_expand_v2_op test_reduce_op test_slice_op - test_full_like_op - test_index_select_op - test_top_k_v2_op - test_elementwise_mul_op test_gather_nd_op - test_elementwise_pow_op - test_reshape_op - test_meshgrid_op test_scale_op - test_scatter_op test_layer_norm_op - test_cast_op - test_roll_op - test_atan2_op - test_top_k_op test_where_op test_arg_min_max_op - test_reverse_op - test_flip - test_triangular_solve_op test_scatter_nd_op test_instance_norm_op test_cumsum_op - test_split_op - test_erf_op - test_assign_op - test_flatten_contiguous_range_op) + test_erf_op) foreach(TEST_CINN_OP ${TEST_CINN_OPS}) if(WITH_CINN) @@ -810,16 +744,12 @@ set(STATIC_BUILD_TESTS test_batch_norm_op test_bincount_op test_decoupled_py_reader - test_eigh_op test_fetch_lod_tensor_array test_fuse_bn_act_pass test_layer_norm_op test_lookup_table_v2_op_deprecated - test_matmul_op - test_matmul_v2_op test_momentum_op test_nce - test_paddle_save_load_binary test_reduce_op test_sparse_conv_op test_sparse_norm_op @@ -863,11 +793,7 @@ set_tests_properties( ENVIRONMENT 
"FLAGS_cudnn_deterministic=1;FLAGS_cudnn_batchnorm_spatial_persistent=1;FLAGS_conv_workspace_size_limit=1000" ) -set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_layer_norm_op_static_build PROPERTIES TIMEOUT 1500) -set_tests_properties(test_paddle_save_load_binary_static_build - PROPERTIES TIMEOUT 120) set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500) py_test_modules(test_stride MODULES test_stride ENVS FLAGS_use_stride_kernel=true) @@ -875,6 +801,5 @@ py_test_modules(test_stride MODULES test_stride ENVS set_tests_properties(test_linalg_matrix_exp PROPERTIES TIMEOUT 120) set_pir_tests_properties() -set_tests_properties(test_fractional_max_pool2d_op PROPERTIES TIMEOUT 120) - set_tests_properties(test_reduce_as_op PROPERTIES TIMEOUT 30) +set_tests_properties(test_attribute_var_deprecated PROPERTIES TIMEOUT 100) diff --git a/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py b/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py index 880a7cf949a62..5ed16ca8675b1 100644 --- a/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py +++ b/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle.base import Program, core, program_guard diff --git a/test/deprecated/legacy_test/test_arg_min_max_op.py b/test/deprecated/legacy_test/test_arg_min_max_op.py index c35fa9f8f7d39..69b98997aeed5 100644 --- a/test/deprecated/legacy_test/test_arg_min_max_op.py +++ b/test/deprecated/legacy_test/test_arg_min_max_op.py @@ -17,7 +17,7 @@ import numpy as np from op_test import OpTest, convert_float_to_uint16 -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle.base import Program, program_guard diff --git a/test/deprecated/legacy_test/test_attribute_var_deprecated.py b/test/deprecated/legacy_test/test_attribute_var_deprecated.py new file mode 100644 index 0000000000000..5f09dff909395 --- /dev/null +++ b/test/deprecated/legacy_test/test_attribute_var_deprecated.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import numpy as np + +import paddle +import paddle.inference as paddle_infer +from paddle.base.framework import Program, program_guard + +paddle.enable_static() + + +class UnittestBase(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.init_info() + + def tearDown(self): + self.temp_dir.cleanup() + + def init_info(self): + self.shapes = None + self.save_path = None + + def path_prefix(self): + return type(self).__name__ + + def infer_prog(self): + config = paddle_infer.Config( + self.save_path + '.pdmodel', self.save_path + '.pdiparams' + ) + config.disable_mkldnn() + predictor = paddle_infer.create_predictor(config) + input_names = predictor.get_input_names() + for i, shape in enumerate(self.shapes): + input_handle = predictor.get_input_handle(input_names[i]) + self.fake_input = np.random.randn(*shape).astype("float32") + input_handle.reshape(shape) + input_handle.copy_from_cpu(self.fake_input) + predictor.run() + output_names = predictor.get_output_names() + res = [] + for out_name in output_names: + output_handle = predictor.get_output_handle(out_name) + output_data = output_handle.copy_to_cpu() + res.append(output_data) + + if len(output_names) == 1: + res = res[0] + + return res + + +class TestDropout(UnittestBase): + def init_info(self): + self.shapes = [[10, 10]] + self.save_path = os.path.join(self.temp_dir.name, 'dropout') + + def test_static(self): + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + fc = paddle.nn.Linear(10, 10) + x = paddle.randn(self.shapes[0]) + x.stop_gradient = False + feat = fc(x) + # p is a Variable + p = paddle.randn([1]) + out = paddle.nn.functional.dropout(feat, p=p) + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + # test _to_string + self.assertTrue("Var[" in str(main_prog)) + + exe = paddle.static.Executor() + exe.run(startup_prog) + res = exe.run(fetch_list=[x, out]) + # export model + paddle.static.save_inference_model(self.save_path, [x], [out], exe) + + # Test for Inference Predictor + infer_out = self.infer_prog() + self.assertEqual(infer_out.shape, (10, 10)) + + self.assertEqual( + main_prog.block(0).ops[4].all_attrs()['dropout_prob'].name, + p.name, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/deprecated/legacy_test/test_eye_op.py b/test/deprecated/legacy_test/test_eye_op.py index 41a4e6aea2f9d..cafbfbd96beb0 100644 --- a/test/deprecated/legacy_test/test_eye_op.py +++ b/test/deprecated/legacy_test/test_eye_op.py @@ -17,7 +17,7 @@ import numpy as np from op_test import OpTest -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle import base diff --git a/test/deprecated/legacy_test/test_inference_model_io.py b/test/deprecated/legacy_test/test_inference_model_io.py index 2e179cf90276e..3b9d486e791e4 100644 --- a/test/deprecated/legacy_test/test_inference_model_io.py +++ b/test/deprecated/legacy_test/test_inference_model_io.py @@ -29,6 +29,7 @@ load_inference_model_distributed, save_persistables, ) +from paddle.pir_utils import test_with_pir_api from paddle.static.io import load_inference_model, save_inference_model paddle.enable_static() @@ -161,14 +162,15 @@ def test_fit_line_inference_model(self): class TestSaveInferenceModel(unittest.TestCase): + @test_with_pir_api def test_save_inference_model(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, 
"inference_model2") - init_program = Program() - program = Program() + init_program = paddle.static.Program() + program = paddle.static.Program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -188,14 +190,15 @@ def test_save_inference_model(self): ) root_path.cleanup() + @test_with_pir_api def test_save_inference_model_with_auc(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, "inference_model4") - init_program = Program() - program = Program() + init_program = paddle.static.Program() + program = paddle.static.Program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='int32') predict = paddle.static.nn.fc(x, size=2, activation='softmax') @@ -223,14 +226,15 @@ def test_save_inference_model_with_auc(self): class TestInstance(unittest.TestCase): + # @test_with_pir_api def test_save_inference_model(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, "inference_model3") - init_program = Program() - program = Program() + init_program = paddle.static.Program() + program = paddle.static.Program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -261,14 +265,15 @@ def test_save_inference_model(self): class TestSaveInferenceModelNew(unittest.TestCase): + # @test_with_pir_api def test_save_and_load_inference_model(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, "inference_model5") - init_program = base.default_startup_program() - program = base.default_main_program() + init_program = paddle.static.default_startup_program() + program = paddle.static.default_main_program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -283,7 +288,7 @@ def test_save_and_load_inference_model(self): sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() - exe = executor.Executor(place) + exe = base.Executor(place) exe.run(init_program, feed={}, fetch_list=[]) tensor_x = np.array([[1, 1], [1, 2], [5, 2]]).astype("float32") @@ -344,7 +349,12 @@ def test_save_and_load_inference_model(self): exe, ) - model_path = MODEL_DIR + "_isdir.pdmodel" + if paddle.framework.in_pir_mode(): + MODEL_SUFFIX = ".json" + else: + MODEL_SUFFIX = ".pdmodel" + + model_path = MODEL_DIR + "_isdir" + MODEL_SUFFIX os.makedirs(model_path) self.assertRaises( ValueError, @@ -356,7 +366,7 @@ def test_save_and_load_inference_model(self): ) os.rmdir(model_path) - params_path = MODEL_DIR + "_isdir.pdmodel" + params_path = MODEL_DIR + "_isdir" + MODEL_SUFFIX os.makedirs(params_path) self.assertRaises( ValueError, @@ -372,7 +382,7 @@ def test_save_and_load_inference_model(self): MODEL_DIR, [x, y], [avg_cost], exe ) - self.assertTrue(os.path.exists(MODEL_DIR + 
".pdmodel")) + self.assertTrue(os.path.exists(MODEL_DIR + MODEL_SUFFIX)) self.assertTrue(os.path.exists(MODEL_DIR + ".pdiparams")) expected = exe.run( @@ -405,7 +415,7 @@ def test_save_and_load_inference_model(self): unsupported_param=None, ) self.assertRaises( - (TypeError, ValueError), + (TypeError, RuntimeError, ValueError), paddle.static.load_inference_model, None, exe, @@ -435,7 +445,7 @@ def test_save_and_load_inference_model(self): self.assertRaises(ValueError, paddle.static.io.save_to_file, '', 123) # test _get_valid_program self.assertRaises(TypeError, paddle.static.io._get_valid_program, 0) - p = Program() + p = paddle.static.Program() cp = CompiledProgram(p) paddle.static.io._get_valid_program(cp) self.assertTrue(paddle.static.io._get_valid_program(cp) is p) @@ -491,12 +501,13 @@ def test_serialize_program_and_persistables(self): None, ) + @test_with_pir_api def test_normalize_program(self): - init_program = base.default_startup_program() - program = base.default_main_program() + init_program = paddle.static.default_startup_program() + program = paddle.static.default_main_program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -525,7 +536,7 @@ def test_normalize_program(self): # test if return type of serialize_program is bytes res = paddle.static.normalize_program(program, [x, y], [avg_cost]) - self.assertTrue(isinstance(res, Program)) + self.assertTrue(isinstance(res, paddle.static.Program)) # test program type self.assertRaises( TypeError, paddle.static.normalize_program, None, [x, y], [avg_cost] @@ -545,6 +556,7 @@ def test_normalize_program(self): class TestLoadInferenceModelError(unittest.TestCase): + @test_with_pir_api def test_load_model_not_exist(self): place = core.CPUPlace() exe = executor.Executor(place) diff --git a/test/deprecated/legacy_test/test_inverse_op.py b/test/deprecated/legacy_test/test_inverse_op.py index 22810eecee07d..54f8466bd4d02 100644 --- a/test/deprecated/legacy_test/test_inverse_op.py +++ b/test/deprecated/legacy_test/test_inverse_op.py @@ -35,6 +35,12 @@ def setUp(self): np.random.seed(123) mat = np.random.random(self.matrix_shape).astype(self.dtype) + if self.dtype == 'complex64' or self.dtype == 'complex128': + mat = ( + np.random.random(self.matrix_shape) + + 1j * np.random.random(self.matrix_shape) + ).astype(self.dtype) + inverse = np.linalg.inv(mat) self.inputs = {'Input': mat} @@ -92,6 +98,26 @@ def config(self): self.python_api = paddle.tensor.math.inverse +class TestInverseOpComplex64(TestInverseOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "complex64" + self.python_api = paddle.tensor.math.inverse + + def test_grad(self): + self.check_grad(['Input'], 'Output', check_pir=True) + + +class TestInverseOpComplex128(TestInverseOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "complex128" + self.python_api = paddle.tensor.math.inverse + + def test_grad(self): + self.check_grad(['Input'], 'Output', check_pir=True) + + class TestInverseAPI(unittest.TestCase): def setUp(self): np.random.seed(123) diff --git a/test/deprecated/legacy_test/test_multinomial_op.py b/test/deprecated/legacy_test/test_multinomial_op.py index f6fc6e281193b..48c00ed5506e5 100644 --- a/test/deprecated/legacy_test/test_multinomial_op.py +++ b/test/deprecated/legacy_test/test_multinomial_op.py @@ -17,7 +17,7 
@@ import numpy as np from op_test import OpTest, convert_float_to_uint16 -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle import base diff --git a/test/deprecated/legacy_test/test_squared_l2_norm_op.py b/test/deprecated/legacy_test/test_squared_l2_norm_op.py deleted file mode 100755 index df36c81097051..0000000000000 --- a/test/deprecated/legacy_test/test_squared_l2_norm_op.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from numpy import linalg as LA -from op_test import OpTest - -import paddle -import paddle.distributed as dist -from paddle import _C_ops, _legacy_C_ops -from paddle.framework import in_dynamic_mode - - -def test_squared_l2_norm(x): - if in_dynamic_mode(): - return _C_ops.squared_l2_norm(x) - else: - return _legacy_C_ops.squared_l2_norm(x) - - -class TestSquaredL2NormF16Op(unittest.TestCase): - def init_test_case(self): - X = np.random.uniform(-0.1, 0.1, (8, 5, 10)).astype('float32') - return X - - def check_main(self, x_np, dtype): - paddle.disable_static() - x = paddle.to_tensor(x_np) - - x.stop_gradient = False - y = test_squared_l2_norm(x) - x_g = paddle.grad(y, [x]) - - paddle.enable_static() - return y, x_g - - def test_main(self): - x_np = self.init_test_case() - y_np_1, x_g_np_1 = self.check_main(x_np, 'float32') - y_np_2, x_g_np_2 = self.check_main(x_np, 'float16') - - def assert_equal(x, y): - np.testing.assert_allclose(x, y, rtol=1e-05, atol=0.0) - - assert_equal(y_np_1, y_np_2) - assert_equal(x_g_np_1, x_g_np_2) - - -class TestSquaredL2NormF16Op1(TestSquaredL2NormF16Op): - def init_test_case(self): - X = np.random.uniform(-2.0, 2.0, (30, 10)).astype('float32') - return X - - -class TestSquaredL2NormF16Op2(TestSquaredL2NormF16Op): - def init_test_case(self): - X = np.random.uniform(-5.0, 5.0, (20, 10, 20)).astype('float32') - return X - - -class TestL2LossOp(OpTest): - """Test squared_l2_norm""" - - def config(self): - self.x_shape = (13, 19) - self.check_auto_parallel = False - - def setUp(self): - self.config() - self.python_api = test_squared_l2_norm - self.op_type = "squared_l2_norm" - self.max_relative_error = 0.05 - - X = np.random.uniform(-1, 1, self.x_shape).astype("float32") - X[np.abs(X) < self.max_relative_error] = 0.1 - self.inputs = {'X': X} - self.outputs = {'Out': np.array([np.square(LA.norm(X))])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad( - ['X'], - 'Out', - max_relative_error=self.max_relative_error, - check_auto_parallel=self.check_auto_parallel, - ) - - -class TestSquaredL2NormAutoParallel_1(TestL2LossOp): - def config(self): - self.x_shape = (14, 18) - self.check_auto_parallel = True - self.placements = { - 'X': [dist.Replicate()], - } - - -class TestSquaredL2NormAutoParallel_2(TestL2LossOp): - def config(self): - self.x_shape = (14, 18) - self.check_auto_parallel = 
True - self.placements = { - 'X': [dist.Shard(0)], - } - - -class TestSquaredL2NormAutoParallel_3(TestL2LossOp): - def config(self): - self.x_shape = (14, 18) - self.check_auto_parallel = True - self.placements = { - 'X': [dist.Shard(1)], - } - - -class TestL2LossDeterministic(unittest.TestCase): - def check_place(self, place): - with paddle.base.dygraph.guard(place): - x_np = np.random.rand(5, 11, 13).astype('float32') - x = paddle.to_tensor(x_np) - y1 = _legacy_C_ops.squared_l2_norm(x) - y2 = _legacy_C_ops.squared_l2_norm(x) - np.testing.assert_array_equal(y1.numpy(), y2.numpy()) - - def test_main(self): - self.check_place(paddle.CPUPlace()) - if paddle.is_compiled_with_cuda(): - self.check_place(paddle.CUDAPlace(0)) - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/deprecated/legacy_test/test_squeeze2_op_rename.py b/test/deprecated/legacy_test/test_squeeze2_op_rename.py index ed347eda7350b..02e63c0cb2459 100644 --- a/test/deprecated/legacy_test/test_squeeze2_op_rename.py +++ b/test/deprecated/legacy_test/test_squeeze2_op_rename.py @@ -15,7 +15,7 @@ import os import unittest -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle.base.framework import Program, program_guard diff --git a/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py b/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py index b36a5121d2e82..5127589c36396 100644 --- a/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py +++ b/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py @@ -22,7 +22,7 @@ (%38) = "pd_op.data" () {dtype:(pd_op.DataType)bfloat16,name:"linear_0.tmp_0",persistable:[false],place:(pd_op.Place)Place(gpu:0),shape:(pd_op.IntArray)[4096,1,28672],stop_gradient:[false]} : () -> builtin.tensor<4096x1x28672xbf16> (%48) = "pd_op.data" () {dtype:(pd_op.DataType)bfloat16,name:"input",persistable:[false],place:(pd_op.Place)Place(gpu:0),shape:(pd_op.IntArray)[4096,1,28672],stop_gradient:[false]} : () -> builtin.tensor<4096x1x28672xbf16> (%50) = "pd_op.matmul" (%48, %2) {persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:true} : (builtin.tensor<4096x1x28672xbf16>, builtin.tensor<8192x28672xbf16>) -> builtin.tensor<4096x1x8192xbf16> - (%57) = "pd_op.c_allreduce_sum_" (%50) {persistable:[false],ring_id:(Int32)36,stop_gradient:[false],use_calc_stream:true,use_model_parallel:true} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16> + (%57) = "pd_op.c_allreduce_sum_" (%50) {event_to_record:"event_7989",events_to_wait:[],execution_stream:"auto_parallel_mp",force_record_event:false,persistable:[false],ring_id:(Int32)36,stop_gradient:[false],use_calc_stream:true,use_model_parallel:true} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16> (%63) = "pd_op.assign" (%57) {persistable:[false],stop_gradient:[false]} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16> (%64) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xi32> (%65) = "pd_op.split_with_num" (%63, %64) {num:(Int32)2,persistable:[false],stop_gradient:[false]} : (builtin.tensor<4096x1x8192xbf16>, builtin.tensor<1xi32>) -> vec[builtin.tensor<2048x1x8192xbf16>,builtin.tensor<2048x1x8192xbf16>] diff --git a/test/distribution/test_distribution_student_t.py 
b/test/distribution/test_distribution_student_t.py new file mode 100644 index 0000000000000..900e47cea2428 --- /dev/null +++ b/test/distribution/test_distribution_student_t.py @@ -0,0 +1,274 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import parameterize +import scipy.stats +from distribution import config +from parameterize import ( + TEST_CASE_NAME, + parameterize_cls, + parameterize_func, +) + +import paddle +from paddle.distribution.student_t import StudentT + + +@parameterize.place(config.DEVICES) +@parameterize.parameterize_cls( + (parameterize.TEST_CASE_NAME, 'df', 'loc', 'scale'), + [ + ( + 'one-dim', + 10.0, + 1.0, + 2.0, + ), + ( + 'multi-dim', + parameterize.xrand((2, 1), dtype='float32', min=4, max=30), + parameterize.xrand((2, 3), dtype='float32', min=1, max=10), + parameterize.xrand((2, 3), dtype='float32', min=0.1, max=3), + ), + ( + 'multi-dim2', + parameterize.xrand((2, 1), dtype='float64', min=4, max=30), + parameterize.xrand((2, 3), dtype='float64', min=-10, max=-1), + parameterize.xrand((2, 3), dtype='float64', min=0.1, max=3), + ), + ], +) +class TestStudentT(unittest.TestCase): + def setUp(self): + df = ( + self.df if isinstance(self.df, float) else paddle.to_tensor(self.df) + ) + loc = ( + self.loc + if isinstance(self.loc, float) + else paddle.to_tensor(self.loc) + ) + scale = ( + self.scale + if isinstance(self.scale, float) + else paddle.to_tensor(self.scale) + ) + self._dist = StudentT(df, loc, scale) + + def test_mean(self): + mean = self._dist.mean + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + self.assertEqual(mean.numpy().dtype, target_dtype) + np.testing.assert_allclose( + mean, + self._np_mean(), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_variance(self): + var = self._dist.variance + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + self.assertEqual(var.numpy().dtype, target_dtype) + np.testing.assert_allclose( + var, + self._np_variance(), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_entropy(self): + entropy = self._dist.entropy() + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + self.assertEqual(entropy.numpy().dtype, target_dtype) + np.testing.assert_allclose( + entropy, + self._np_entropy(), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_sample(self): + sample_shape = () + samples = self._dist.sample(sample_shape) + self.assertEqual( + tuple(samples.shape), + sample_shape + self._dist.batch_shape + self._dist.event_shape, + ) + + sample_shape = (10000,) + samples = self._dist.sample(sample_shape) + sample_mean = samples.mean(axis=0) + sample_variance = samples.var(axis=0) + + # Tolerance value 0.1 is an empirical value which is consistent with + # TensorFlow + np.testing.assert_allclose( + sample_mean, self._dist.mean, atol=0, rtol=0.10 + ) + # Tolerance value 0.1 is an empirical value which is consistent with + # TensorFlow + np.testing.assert_allclose( + sample_variance, self._dist.variance, atol=0, rtol=0.10 + ) + + def _np_variance(self): + if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32: + df = self.df.astype("float64") + else: + df = self.df + if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32: + loc = self.loc.astype("float64") + else: + loc = self.loc + if ( + isinstance(self.scale, np.ndarray) + and self.scale.dtype == np.float32 + ): + scale = self.scale.astype("float64") + else: + scale = self.scale + return scipy.stats.t.var(df, loc, scale) + + def _np_mean(self): + if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32: + df = self.df.astype("float64") + else: + df = self.df + if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32: + loc = self.loc.astype("float64") + else: + loc = self.loc + if ( + isinstance(self.scale, np.ndarray) + and self.scale.dtype == np.float32 + ): + scale = self.scale.astype("float64") + else: + scale = self.scale + return scipy.stats.t.mean(df, loc, scale) + + def _np_entropy(self): + if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32: + df = self.df.astype("float64") + else: + df = self.df + if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32: + loc = self.loc.astype("float64") + else: + loc = self.loc + if ( + isinstance(self.scale, np.ndarray) + and self.scale.dtype == np.float32 + ): + scale = self.scale.astype("float64") + else: + scale = self.scale + return scipy.stats.t.entropy(df, loc, scale) + + +@parameterize.place(config.DEVICES) +@parameterize.parameterize_cls( + (parameterize.TEST_CASE_NAME, 'df', 'loc', 'scale', 'value'), + [ + ( + 'one-dim', + 10.0, + 0.0, + 1.0, + np.array(3.3).astype("float32"), + ), + ( + 'value-broadcast-shape', + parameterize.xrand((2, 1), dtype='float64', min=4, max=30), + parameterize.xrand((2, 1), dtype='float64', min=-10, max=10), + parameterize.xrand((2, 1), dtype='float64', min=0.1, max=5), + parameterize.xrand((2, 4), dtype='float64', min=-10, max=10), + ), + ], +) +class TestStudentTProbs(unittest.TestCase): + def setUp(self): + df = ( + self.df if isinstance(self.df, float) else paddle.to_tensor(self.df) + ) + loc = ( + self.loc + if isinstance(self.loc, float) + else paddle.to_tensor(self.loc) + ) + scale = ( + self.scale + if isinstance(self.scale, float) + else paddle.to_tensor(self.scale) + ) + self._dist = StudentT(df, loc, scale) + + def test_prob(self): + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + np.testing.assert_allclose( + self._dist.prob(paddle.to_tensor(self.value)), + scipy.stats.t.pdf(self.value, self.df, self.loc, self.scale), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_log_prob(self): + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + np.testing.assert_allclose( + self._dist.log_prob(paddle.to_tensor(self.value)), + scipy.stats.t.logpdf(self.value, self.df, self.loc, self.scale), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + +@parameterize.place(config.DEVICES) +@parameterize_cls([TEST_CASE_NAME], ['StudentTTestError']) +class StudentTTestError(unittest.TestCase): + def setUp(self): + paddle.disable_static(self.place) + + @parameterize_func( + [ + (-5.0, 0.0, 1.0, 
ValueError), # negative df + (5.0, 0.0, -1.0, ValueError), # negative scale + ] + ) + def test_bad_parameter(self, df, loc, scale, error): + with paddle.base.dygraph.guard(self.place): + self.assertRaises(error, StudentT, df, loc, scale) + + @parameterize_func([(10,)]) # not sequence object sample shape + def test_bad_sample_shape(self, shape): + with paddle.base.dygraph.guard(self.place): + t = StudentT(5.0, 0.0, 1.0) + self.assertRaises(TypeError, t.sample, shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/dygraph_to_static/test_typehint.py b/test/dygraph_to_static/test_typehint.py index fd4dbacc6ad6d..b84ce4f332a91 100644 --- a/test/dygraph_to_static/test_typehint.py +++ b/test/dygraph_to_static/test_typehint.py @@ -35,15 +35,15 @@ def function(x: A) -> A: def fn_annotation_assign_with_value(x: paddle.Tensor): if x: - y: List["paddle.Tensor"] = [x + 1] + y: List[paddle.Tensor] = [x + 1] else: - y: List["paddle.Tensor"] = [x - 1] + y: List[paddle.Tensor] = [x - 1] return y def fn_annotation_assign_without_value(x: paddle.Tensor): if x: - y: List["paddle.Tensor"] + y: List[paddle.Tensor] y = [x + 1] else: y = [x - 1] diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index c176e802a525c..85724a2cc7df2 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -22,9 +22,10 @@ import paddle from paddle import base -from paddle.base import Program, Variable, core +from paddle.base import core from paddle.base.core import AnalysisConfig, create_paddle_predictor from paddle.base.framework import IrGraph +from paddle.static import Variable from paddle.static.io import append_fetch_ops, prepend_feed_ops from paddle.static.quantization import ( AddQuantDequantPass, @@ -39,10 +40,10 @@ class QuantDequantTest(unittest.TestCase): def __init__(self, methodName='runTest'): super().__init__(methodName) paddle.enable_static() - self.main_program = base.Program() - self.startup_program = base.Program() - self.test_main_program = base.Program() - self.test_startup_program = base.Program() + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + self.test_main_program = paddle.static.Program() + self.test_startup_program = paddle.static.Program() self.feeds = None self.fetch_list = None self.enable_mkldnn = False @@ -62,10 +63,9 @@ def __init__(self, methodName='runTest'): # from Paddle release2.1 def _normalize_program(self, program, feed_vars, fetch_vars): - if not isinstance(program, Program): + if not isinstance(program, paddle.static.Program): raise TypeError( - "program type must be `base.Program`, but received `%s`" - % type(program) + f"program type must be `paddle.static.Program`, but received `{type(program)}`" ) if not isinstance(feed_vars, list): feed_vars = [feed_vars] @@ -127,7 +127,7 @@ def _save_models( if var.name in feeded_var_names: feeded_vars.append(var) - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): paddle.static.io.save_inference_model( dirname, feeded_vars, @@ -155,7 +155,7 @@ def _get_paddle_outs(self, feed, fetch_list, executor, program, scope): ''' Return PaddlePaddle outputs. ''' - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): outs = executor.run( program=program, feed=feed, @@ -245,12 +245,12 @@ def check_output_with_option( or disable TensorRT, enable MKLDNN or disable MKLDNN are all the same. 
''' - place = base.CUDAPlace(0) if use_gpu else base.CPUPlace() - executor = base.Executor(place) - scope = base.Scope() + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + executor = paddle.static.Executor(place) + scope = paddle.static.Scope() device = "GPU" if use_gpu else "CPU" - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): executor.run(self.startup_program) executor.run(self.test_startup_program) main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False) @@ -274,11 +274,11 @@ def check_output_with_option( scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place) scale_training_pass.apply(main_graph) - build_strategy = base.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False build_strategy.fuse_all_reduce_ops = False - binary = base.CompiledProgram(main_graph.graph) + binary = paddle.static.CompiledProgram(main_graph.graph) iters = 10 batch_size = 1 @@ -287,7 +287,7 @@ def check_output_with_option( batch_size=batch_size, ) feeder = base.DataFeeder(feed_list=[self.data, self.label], place=place) - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): for _ in range(iters): data = next(train_reader()) loss_v = executor.run( @@ -307,7 +307,7 @@ def check_output_with_option( self.main_program = test_graph.to_program() - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): self.main_program = self._normalize_program( self.main_program, self.data, self.fetch_list ) @@ -450,6 +450,6 @@ def __init__( self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16 def quant_dequant(self): - place = base.CPUPlace() - exe = base.Executor(place) - scope = base.Scope() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() diff --git a/test/ir/pir/cinn/sub_graphs/base.py b/test/ir/pir/cinn/sub_graphs/base.py index a11ffe4f9e1bd..a0ceee03095db 100644 --- a/test/ir/pir/cinn/sub_graphs/base.py +++ b/test/ir/pir/cinn/sub_graphs/base.py @@ -30,7 +30,7 @@ def setUp(self): self.atol = 1e-6 self.train_atol = 1e-6 self.with_precision_compare = True - self.with_train = False # defaults to False in this PR; the next incremental PR switches the default to True + self.with_train = True # defaults to False in this PR; the next incremental PR switches the default to True # override customized settting self.init() if self.inputs: diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py index e5d86d0e40f53..228465812c587 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py @@ -135,7 +135,6 @@ def init(self): paddle.rand(shape=[22, 512, 7, 7], dtype=paddle.float32), ) self.net = LayerCase - self.with_train = True def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py index 10ed97211646c..d40e635bca9ed 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py @@ -62,6 +62,7 @@ def init(self): paddle.rand(shape=[10, 512, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py index c151d478a6ac6..b871017d1e038 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py +++ 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py @@ -75,6 +75,7 @@ def init(self): paddle.rand(shape=[10, 36, 28, 28], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py index 464ab6166a0fa..83fd4bff996bc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py @@ -65,6 +65,7 @@ def init(self): paddle.rand(shape=[10, 1280, 1, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py index 24d79ccfc8e94..dd91f88558b59 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py @@ -60,6 +60,7 @@ def init(self): paddle.rand(shape=[10, 2048, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py index 167b10dd6df2f..7708b6fb6c2bb 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py @@ -72,6 +72,7 @@ def init(self): paddle.rand(shape=[22, 128, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py index c5050e5cb9d55..4d1ac693615d3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py @@ -72,6 +72,7 @@ def init(self): paddle.rand(shape=[10, 122, 28, 28], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py index 5fad58c5de16b..3e6696a5f23c9 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py @@ -115,6 +115,7 @@ def init(self): paddle.rand(shape=[22, 28, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py index 5dc0d861cc847..62ef8a2dbe38c 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py @@ -60,6 +60,7 @@ def init(self): paddle.rand(shape=[22, 2048, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py index b4010043304be..e8f4772b757a5 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[22, 1536, 8, 8], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False # NOTE output mismatch with prim diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py index d3faccc973b03..883067279e417 100644 --- 
a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py @@ -74,7 +74,6 @@ def init(self): paddle.rand(shape=[43, 256, 56, 56], dtype=paddle.float32), ) self.net = LayerCase - self.with_train = True def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py index 57dcec3e56353..82523d9dd29e4 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py @@ -77,6 +77,7 @@ def init(self): paddle.rand(shape=[86, 192], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py index 49eea1bd4cbfd..b19151557a65a 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py @@ -108,6 +108,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[86, 198, 192], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False # NOTE output mismatch with prim diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py index 83ddc2b51b2b8..b37c912b61f5d 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py @@ -60,6 +60,7 @@ def init(self): paddle.rand(shape=[11, 24, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py index b434f440365f6..d6be0ea181c59 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[11, 1280, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py index 6a25c112a0b47..5387f9ee37177 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[10, 320, 8, 8], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py index 85b2207fd1ee1..9283f453e46ae 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[10, 2048, 10, 10], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py index 23b9ec755c7be..9c538dea0d694 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py @@ -89,6 +89,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[16, 49], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py index 81d18df09b741..eee47cf931cd9 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py @@ -66,6 +66,7 @@ def init(self): paddle.rand(shape=[22, 288, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py index 7586bd7c8cd37..2bed2bfc9a742 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[22, 1024, 1, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py index 0d50f420cdc22..55b168f5e2ade 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py @@ -84,6 +84,7 @@ def init(self): paddle.rand(shape=[10, 256, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py index 7466135585abd..a8d09423a95eb 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py @@ -57,6 +57,7 @@ def init(self): paddle.rand(shape=[10, 32, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py index 7eb05d010bd2f..8c70aa1f75ae2 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py @@ -84,6 +84,7 @@ def init(self): paddle.rand(shape=[4, 3, 384, 384], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py index 03f141b241bdc..6abd8655d98f6 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py @@ -70,6 +70,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[6, 9216, 96], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py index 431650d6bdbef..828f15fa32c3b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py @@ -48,6 +48,7 @@ def init(self): paddle.rand(shape=[4, 48, 96, 96], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py index ddd3cdf8c3eda..44431cb437d82 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py @@ -46,6 +46,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[12, 288, 192], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py index 9d419dbb38959..f03c8322cce70 
100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py @@ -51,6 +51,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[22, 196, 128], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py index 352f81b791d41..d3d09e75e4f70 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py @@ -66,6 +66,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py index 0e8a6574081a4..60d3846377987 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py @@ -114,6 +114,7 @@ def init(self): paddle.rand(shape=[2, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py index 0104a18d75d60..9440b6cb9dbd5 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py @@ -258,6 +258,7 @@ def init(self): paddle.rand(shape=[1, 2048, 24, 36], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py index 06c021953fd1e..34416aea9ae97 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py @@ -143,6 +143,7 @@ def init(self): paddle.rand(shape=[1, 100, 256], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 self.with_cinn = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py index 8c9802242f436..d2f6befdc9147 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py @@ -70,6 +70,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[1, 4], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py index 6e45b88c332da..19ec352bcf5d4 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py @@ -62,6 +62,7 @@ def init(self): paddle.rand(shape=[1, 80, 50, 50], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py index 72599e85f742f..5096d5f366b63 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py @@ -47,6 +47,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py index 
eaa9d3e6b9232..7fc4b64f1466f 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py @@ -190,6 +190,7 @@ def init(self): paddle.rand(shape=[1, 625, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py index 34ecd19552529..4367e45015b23 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py @@ -66,6 +66,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py index 7c9639d906cda..181d06fffb4c3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py @@ -46,6 +46,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[22, 16, 384], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py index 10ab5da982012..152dc5b2ce483 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py @@ -90,6 +90,7 @@ def init(self): paddle.rand(shape=[1, 4, 64, 64], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py index ed08605e070d1..e1a3774b1be35 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py @@ -94,6 +94,7 @@ def init(self): paddle.rand(shape=[91], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py index cf04f914d15a9..7bdef30c7d243 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py @@ -117,6 +117,7 @@ def init(self): paddle.rand(shape=[1, 96, 128, 128], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py index 7d065da0bc99b..9a623a7afa130 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py @@ -78,6 +78,7 @@ def init(self): paddle.rand(shape=[1, 192, 32, 32], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py index 79d9a9c15cf9e..4646923191e60 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py @@ -74,6 +74,7 @@ def init(self): paddle.rand(shape=[24], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py index a34e30dc687e2..d297a19fa0932 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py +++ 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py @@ -42,6 +42,7 @@ def init(self): self.input_specs = [] self.inputs = () self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py index 12dc85dbf3d3f..072c8077b7295 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py @@ -95,6 +95,7 @@ def init(self): paddle.rand(shape=[1, 44, 32, 32], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py index f51b3a846151d..89a1c19ed53a7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py @@ -47,6 +47,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[10, 196, 640], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py index 21332c862ab22..41be02a221bd4 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py @@ -91,6 +91,7 @@ def init(self): paddle.rand(shape=[1, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py index d4a2234509d1c..dd6069d9f9555 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py @@ -71,6 +71,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py index 5456431c96fea..6a6f430bd82be 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py @@ -96,6 +96,7 @@ def init(self): paddle.rand(shape=[171888, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py index 9ec76729c00e0..820f7af48178e 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py @@ -72,6 +72,7 @@ def init(self): paddle.rand(shape=[512, 256, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py index 18af525df5c4c..e7e636628d5f1 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py @@ -55,6 +55,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[2, 2002], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py index 1c3d72c455056..033202891b2ed 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py 
@@ -64,6 +64,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[2, 1788], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py index 75fb8ca7cfb38..74513aac91b5b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py @@ -134,6 +134,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[1], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False self.with_cinn = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py index d3571d898798f..67df4b8fba497 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py @@ -206,6 +206,7 @@ def init(self): paddle.rand(shape=[528, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py index c1c4b94929310..4e64e3aea0bbc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py @@ -65,6 +65,7 @@ def init(self): paddle.rand(shape=[1, 171888, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py index f4236d7664c59..bdc2d7b052c77 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py @@ -91,6 +91,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[49, 49], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False self.with_cinn = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py index 30b04988e601f..a483c47e1e05f 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py @@ -61,6 +61,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py index ff048a21337da..489eab05cf04e 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py @@ -143,6 +143,7 @@ def init(self): self.input_specs = [] self.inputs = () self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py index ea4a9cd49726d..a75d51a21cd1e 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py @@ -98,6 +98,7 @@ def init(self): paddle.rand(shape=[2], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_cinn = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py index a069b9bc3874b..03fcab9ff9f00 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py @@ -75,6 +75,7 @@ def 
init(self): paddle.rand(shape=[2], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py index 41204b7c15d2e..a20fbaf33e4e7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py @@ -96,6 +96,7 @@ def init(self): paddle.rand(shape=[1, 3, 544, 736], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py index bb22fb38c693a..4ad52c6aa976c 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py @@ -209,6 +209,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py index af4320f4609ef..f987f5a334ca6 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py @@ -125,6 +125,7 @@ def init(self): paddle.rand(shape=[1, 256, 13, 19], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py index 96d9de9b9c2b6..1bf2af665a2e2 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py @@ -134,6 +134,7 @@ def init(self): paddle.rand(shape=[1, 3, 96, 96, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py index 6340bf5a4d451..656e522137b4b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py @@ -47,6 +47,7 @@ def init(self): paddle.rand(shape=[22, 128, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py index 2fe8b3f007e86..4a34d06b5b4af 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py @@ -125,6 +125,7 @@ def init(self): paddle.rand(shape=[1, 3, 48, 48, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py index dc0d1e5126259..acbe1eae0ae60 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py @@ -80,6 +80,7 @@ def init(self): paddle.rand(shape=[1, 80, 44, 44], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py index 65ab9b68b7b6d..9761629a802e3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py @@ -173,6 +173,7 @@ def init(self): paddle.rand(shape=[2541, 2], dtype=paddle.float32), ) self.net = 
LayerCase + self.with_train = False self.with_cinn = False # NOTE cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py index 2a1a527317b91..889e5b0e9dfde 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py @@ -96,6 +96,7 @@ def init(self): self.input_specs = [] self.inputs = () self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py index 595163ad073e1..a20bac9133a8f 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py @@ -81,6 +81,7 @@ def init(self): paddle.rand(shape=[1, 2541, 68], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py index 9ef4bf92bc473..80137072f1c23 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py @@ -61,6 +61,7 @@ def init(self): paddle.rand(shape=[16384, 5], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py index 698760309d8ff..47221f58d3ca3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py @@ -247,6 +247,7 @@ def init(self): paddle.rand(shape=[1, 2048, 1, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py index b44fdc4c28783..4e23ab81535de 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py @@ -201,6 +201,7 @@ def init(self): paddle.rand(shape=[1, 144, 21, 32], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py index 425537e634f25..0ed66f4e89e8d 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py @@ -79,6 +79,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[1, 500], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py index ab1503ef63afa..21faaf7dcad30 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py @@ -91,6 +91,7 @@ def init(self): paddle.rand(shape=[1, 256, 28, 40], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py index e8919aec6e379..7dd68051a5efa 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py @@ -90,6 +90,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[49, 196], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False self.with_cinn = False # NOTE prim 
+ cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py index e3f28f9775a69..85f937d265d5b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py @@ -65,6 +65,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[12], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py index d4d06895c49ae..1a166fad740a7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py @@ -48,6 +48,7 @@ def init(self): paddle.rand(shape=[22, 480, 7, 7], dtype=paddle.float32), ) self.net = AdaptiveAvgPool2dCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py index c9cf656ad4a0c..9434d1c189373 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[22, 196, 128], dtype=paddle.float32), ) self.net = AddCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py index c488de14d12be..18cf5c72f2a50 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py @@ -104,6 +104,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = AddNCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py index 0a40ca5079931..957102539eb07 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py @@ -56,6 +56,7 @@ def init(self): paddle.rand(shape=[22, 128, 56, 56], dtype=paddle.float32), ) self.net = AvgPool2dCase + self.with_train = False self.atol = 1e-8 self.with_cinn = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py index 36dae471d0d7d..35e12f767dae7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[10, 2304, 192], dtype=paddle.float32), ) self.net = ChunkCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py index f65682e4b0ae9..b298c0870d4bc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[145, 12, 112, 112], dtype=paddle.float32), ) self.net = ConcatCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py index c189750c9f040..5bdd5b1622a34 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py +++ 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py @@ -63,6 +63,7 @@ def init(self): paddle.rand(shape=[22, 64, 56, 56], dtype=paddle.float32), ) self.net = ConvNdCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py index 381eb461b6328..c4a358ad4b0bf 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py @@ -54,6 +54,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[10, 64], dtype=paddle.float32),) self.net = LinearCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py index 5cd643fc5ef4a..96d2bd54868d1 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py @@ -55,6 +55,7 @@ def init(self): paddle.rand(shape=[22, 64, 112, 112], dtype=paddle.float32), ) self.net = MaxPool2dCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py index 1e56b482d3736..fa389063a0513 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[22, 1500, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py index f628bc19cc9aa..f267c1610f665 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py @@ -47,6 +47,7 @@ def init(self): paddle.rand(shape=[22, 144, 56, 56], dtype=paddle.float32), ) self.net = Relu6Case + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py index 5abaff9157d1d..540958310b7cc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py @@ -44,6 +44,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[4312, 640], dtype=paddle.float32),) self.net = ReshapeCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py index 3f77a5c68a93a..a746f3cdd41bc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[10, 512, 1, 1], dtype=paddle.float32), ) self.net = SigmoidCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py index b82ec109ca724..57de6d8cb09c0 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py @@ -48,6 +48,7 @@ def init(self): paddle.rand(shape=[11, 976, 7, 7], dtype=paddle.float32), ) self.net = SplitCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py index 516d6c6735ff6..4f7438c8a00eb 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py @@ -51,6 +51,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[1, 12, 1, 64], dtype=paddle.float32),) self.net = SqueezeCase + self.with_train = False self.atol = 1e-8 @@ -66,6 +67,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[1, 12, 1, 64], dtype=paddle.float32),) self.net = UnsqueezeCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py index 1f7402d0470ed..da572f47bfd94 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[43, 32, 112, 112], dtype=paddle.float32), ) self.net = SwishCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py index 49a05607e3ae3..51db880532187 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[22, 4, 224, 224], dtype=paddle.float32), ) self.net = TransposeCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py b/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py new file mode 100644 index 0000000000000..a3e9b838eeae4 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
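The new test file that follows drives group_norm through CINN with symbolic dimensions: every dim except the channel is left as None in the InputSpec, so one compiled program must serve any batch and spatial size. A minimal sketch of that dynamic-shape mechanism, using only the public paddle.jit.to_static/InputSpec API (the suite's utils.apply_to_static helper presumably layers a CINN build strategy on top of this):

import paddle
from paddle.static import InputSpec


class Scale(paddle.nn.Layer):
    def forward(self, x):
        return x * 2.0


# None dims stay symbolic, so a single compiled program covers many shapes;
# only the trailing (channel) dim is pinned, matching the test below.
spec = [InputSpec(shape=[None, None, None, 128], dtype='float32')]
static_net = paddle.jit.to_static(Scale(), input_spec=spec)
out_a = static_net(paddle.randn([1, 8, 8, 128]))
out_b = static_net(paddle.randn([2, 16, 4, 128]))  # same program, new shape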
+ +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class GroupNorm(nn.Layer): + def __init__(self): + super().__init__() + self.hidden_size = 768 + self.dtype = "float32" + self.weight = paddle.randn([128], dtype=self.dtype) + self.weight.stop_gradient = False + self.bias = paddle.randn([128], dtype=self.dtype) + self.bias.stop_gradient = False + + self.data_format = "NHWC" + + def forward(self, x): + return paddle.nn.functional.group_norm( + x, + num_groups=32, + epsilon=1e-6, + weight=self.weight, + bias=self.bias, + data_format=self.data_format, + ) + + +class TestGroupNorm(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.shape = [1, 128, 256, 128] + self.dtype = "float32" + self.data_format = "NHWC" + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randn(self.shape, dtype=self.dtype) + self.x.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 2) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 2}) + + def eval(self, use_cinn): + paddle.seed(2024) + net = GroupNorm() + input_spec = [ + InputSpec(shape=[None, None, None, 128], dtype='float32'), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + cinn_out = self.eval(use_cinn=True) + dy_out = self.eval(use_cinn=False) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py b/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py new file mode 100644 index 0000000000000..6443a60c331f9 --- /dev/null +++ b/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
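The PassTest that follows asserts, via its valid_op_map, that onednn_placement_pass rewrites the CPU mean op into its oneDNN variant. A minimal sketch of that op-count contract (the counting helper is illustrative, not the harness's real code; the actual checks live in test/ir/pir/fused_pass/pass_test.py):

from collections import Counter


def check_valid_op_map(op_names, valid_op_map):
    # valid_op_map maps op name -> exact count expected after the passes run;
    # a count of 0 means the op must have been replaced or fused away.
    counts = Counter(op_names)
    for op, expected in valid_op_map.items():
        assert counts[op] == expected, f"{op}: got {counts[op]}, want {expected}"


# After onednn_placement_pass, mean should appear as onednn_op.mean:
check_valid_op_map(
    ["pd_op.data", "onednn_op.mean", "pd_op.assign", "pd_op.fetch"],
    {"onednn_op.mean": 1, "pd_op.mean": 0},
)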
+import unittest + +import numpy as np +from pass_test import PassTest + +import paddle + +paddle.enable_static() + + +class TestMeanPlacementPass(PassTest): + def is_program_valid(self, program=None): + return True + + def build_ir_program(self): + with paddle.pir_utils.IrGuard(): + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.pir.core.program_guard(main_prog, start_prog): + x = paddle.static.data( + name='x', shape=[5, 2, 5, 5], dtype='float32' + ) + mean = paddle.mean(x) + out = paddle.assign(mean) + self.pass_attr_list = [{'onednn_placement_pass': {}}] + + self.feeds = { + "x": np.random.random((5, 2, 5, 5)).astype("float32"), + } + self.fetch_list = [out] + self.valid_op_map = { + "onednn_op.mean": 1, + } + return [main_prog, start_prog] + + def sample_program(self): + yield self.build_ir_program(), False + + def setUp(self): + self.places.append(paddle.CPUPlace()) + + def test_check_output(self): + self.check_pass_correct() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/ir/pir/fused_pass/pass_test.py b/test/ir/pir/fused_pass/pass_test.py index 3bb937ec59771..c5066bad6b34f 100644 --- a/test/ir/pir/fused_pass/pass_test.py +++ b/test/ir/pir/fused_pass/pass_test.py @@ -69,7 +69,7 @@ def run_program(self, executor, startup_program, main_program): fetches = executor.run( main_program, feed=self.feeds, - fetch_list=self.fetch_list, + fetch_list=main_program.list_vars()[-1], ) return fetches diff --git a/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py b/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py index fac6e62bc2278..2af09ed475b33 100644 --- a/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py +++ b/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py @@ -21,8 +21,6 @@ from paddle.base import core from paddle.pir.core import create_parameter -paddle.enable_static() - class TestRmsNormFusePattern(PassTest): r""" @@ -284,7 +282,7 @@ class TestAddLayerNormFusePattern(TestRmsNormFusePattern): def sample_program(self): for x_shape in [[1, 1, 4096]]: for w_shape in [[4096]]: - for w_type in ['float32']: + for x_type in ['float32', 'float16']: for epilson in [1e-6]: with paddle.pir_utils.IrGuard(): start_prog = paddle.static.Program() @@ -295,10 +293,10 @@ def sample_program(self): residual = paddle.static.data( name='residual', shape=x_shape, - dtype='float32', + dtype=x_type, ) x = paddle.static.data( - name='x', shape=x_shape, dtype='float32' + name='x', shape=x_shape, dtype=x_type ) w_attr = paddle.ParamAttr( learning_rate=0.0, @@ -306,13 +304,19 @@ def sample_program(self): mean=0.0, std=2.0 ), ) + b_attr = paddle.ParamAttr( + learning_rate=0.0, + initializer=paddle.nn.initializer.Normal( + mean=0.0, std=2.0 + ), + ) w1 = create_parameter( name="w1", shape=w_shape, - dtype=w_type, + dtype=x_type, initializer=paddle.nn.initializer.Assign( np.random.random([4096, 4096]).astype( - w_type + x_type ) ), ) @@ -322,6 +326,7 @@ def sample_program(self): add_out.shape[-1:], epsilon=epilson, weight_attr=w_attr, + bias_attr=b_attr, ) layer_norm_out = layer_norm(add_out) matmul_out = paddle.matmul(layer_norm_out, w1) @@ -332,11 +337,11 @@ def sample_program(self): ] self.feeds = { "x": np.random.random(x_shape).astype( - "float32" + x_type ), "residual": np.random.random( x_shape - ).astype("float32"), + ).astype(x_type), } self.fetch_list = [out] self.valid_op_map = { @@ -350,5 +355,290 @@ def test_check_output(self): self.check_pass_correct(atol=1e-3, rtol=1e-3) +class TestAddGroupNormPattern_FP16(PassTest): + r""" + x residual + | | + 
add + | + group_norm + """ + + def is_program_valid(self, program=None): + return True + + def sample_program(self): + for x_shape in [[2, 6, 4, 2]]: + for residual_shape in [[1, 6, 1, 1]]: + for dtype in ['float16']: + for epilson in [1e-5]: + for groups in [2]: + for data_layout in ['NCHW']: + rand_value = ( + 0.001 + * paddle.rand( + shape=[x_shape[1]], dtype=dtype + ).numpy() + ) + with paddle.pir_utils.IrGuard(): + start_prog = paddle.static.Program() + main_prog = paddle.static.Program() + with paddle.pir.core.program_guard( + main_prog, start_prog + ): + residual = paddle.static.data( + name='residual', + shape=residual_shape, + dtype=dtype, + ) + x = paddle.static.data( + name='x', shape=x_shape, dtype=dtype + ) + w = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + b = create_parameter( + shape=[residual_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + add_out = paddle.add(x, residual) + + group_norm_out = ( + paddle.nn.functional.group_norm( + add_out, + num_groups=groups, + epsilon=epilson, + weight=w, + bias=b, + data_format=data_layout, + ) + ) + out = paddle.assign(group_norm_out) + self.pass_attr_list = [ + {'add_norm_fuse_pass': {}}, + {'transfer_layout_pass': {}}, + { + 'remove_redundant_transpose_pass': {} + }, + ] + self.feeds = { + "x": np.random.random( + x_shape + ).astype(dtype), + "residual": np.random.random( + residual_shape + ).astype(dtype), + } + self.fetch_list = [out] + self.valid_op_map = { + "pd_op.add": 0, + "pd_op.group_norm": 0, + "pd_op.add_group_norm_silu": 1, + } + yield [main_prog, start_prog], False + + def setUp(self): + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_check_output(self): + self.check_pass_correct() + + +class TestAddGroupNormPatternSilu_FP16(PassTest): + r""" + x residual + | | + add + | + group_norm + """ + + def is_program_valid(self, program=None): + return True + + def sample_program(self): + for x_shape in [[2, 6, 4, 2]]: + for residual_shape in [[1, 6, 1, 1]]: + for dtype in ['float16']: + for epilson in [1e-5]: + for groups in [2]: + for data_layout in ['NCHW']: + rand_value = ( + 0.001 + * paddle.rand( + shape=[x_shape[1]], dtype=dtype + ).numpy() + ) + with paddle.pir_utils.IrGuard(): + start_prog = paddle.static.Program() + main_prog = paddle.static.Program() + with paddle.pir.core.program_guard( + main_prog, start_prog + ): + residual = paddle.static.data( + name='residual', + shape=residual_shape, + dtype=dtype, + ) + x = paddle.static.data( + name='x', shape=x_shape, dtype=dtype + ) + w = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + b = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + add_out = paddle.add(x, residual) + group_norm_out = ( + paddle.nn.functional.group_norm( + add_out, + num_groups=groups, + epsilon=epilson, + weight=w, + bias=b, + data_format=data_layout, + ) + ) + out = paddle.nn.functional.silu( + group_norm_out + ) + out = paddle.assign(out) + self.pass_attr_list = [ + {'add_norm_fuse_pass': {}}, + {'transfer_layout_pass': {}}, + { + 'remove_redundant_transpose_pass': {} + }, + ] + self.feeds = { + "x": np.random.random( + x_shape + ).astype(dtype), + "residual": np.random.random( + residual_shape + ).astype(dtype), + } + self.fetch_list = [out] + self.valid_op_map = { + "pd_op.silu": 0,
"pd_op.add": 0, + "pd_op.group_norm": 0, + "pd_op.add_group_norm_silu": 1, + } + yield [main_prog, start_prog], False + + def setUp(self): + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_check_output(self): + self.check_pass_correct() + + +class GroupNormSiluPattern_FP16(PassTest): + r""" + group_norm + | + silu + """ + + def is_program_valid(self, program=None): + return True + + def sample_program(self): + for x_shape in [[2, 6, 4, 2]]: + for residual_shape in [[1, 6, 1, 1]]: + for dtype in ['float16']: + for epilson in [1e-5]: + for groups in [2]: + for data_layout in ['NCHW']: + rand_value = ( + 0.001 + * paddle.rand( + shape=[x_shape[1]], dtype=dtype + ).numpy() + ) + with paddle.pir_utils.IrGuard(): + start_prog = paddle.static.Program() + main_prog = paddle.static.Program() + with paddle.pir.core.program_guard( + main_prog, start_prog + ): + x = paddle.static.data( + name='x', shape=x_shape, dtype=dtype + ) + w = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + b = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + group_norm_out = ( + paddle.nn.functional.group_norm( + x, + num_groups=groups, + epsilon=epilson, + weight=w, + bias=b, + data_format=data_layout, + ) + ) + out = paddle.nn.functional.silu( + group_norm_out + ) + out = paddle.assign(out) + self.pass_attr_list = [ + {'add_norm_fuse_pass': {}}, + {'transfer_layout_pass': {}}, + { + 'remove_redundant_transpose_pass': {} + }, + ] + self.feeds = { + "x": np.random.random( + x_shape + ).astype(dtype), + } + self.fetch_list = [out] + self.valid_op_map = { + "pd_op.silu": 0, + "pd_op.group_norm": 0, + "pd_op.add_group_norm_silu": 1, + } + yield [main_prog, start_prog], False + + def setUp(self): + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_check_output(self): + self.check_pass_correct() + + if __name__ == "__main__": unittest.main() diff --git a/test/ir/test_ir_fusion_group_pass.py b/test/ir/test_ir_fusion_group_pass.py index 0637efb067f7e..56c723613e939 100644 --- a/test/ir/test_ir_fusion_group_pass.py +++ b/test/ir/test_ir_fusion_group_pass.py @@ -72,7 +72,7 @@ def _feed_random_data(self, feed_vars): elif var.dtype == paddle.float16: dtype = "float16" else: - raise ValueError("Unsupported dtype %s" % var.dtype) + raise ValueError(f"Unsupported dtype {var.dtype}") feeds[var.name] = np.random.random(shape).astype(dtype) return feeds diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 8c4cfe9113ab3..f84458dd494f3 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -155,6 +155,7 @@ if(WIN32) list(REMOVE_ITEM TEST_OPS test_fused_layernorm_op) list(REMOVE_ITEM TEST_OPS test_matmul_int8_op) list(REMOVE_ITEM TEST_OPS test_variable_length_memory_efficient_attention) + list(REMOVE_ITEM TEST_OPS test_ops_nms) endif() list(REMOVE_ITEM TEST_OPS test_checkpoint_saver) @@ -425,10 +426,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu) -list(REMOVE_ITEM TEST_OPS - test_parallel_executor_seresnext_with_fuse_all_reduce_cpu) list(REMOVE_ITEM 
TEST_OPS test_async_ssa_graph_executor_mnist) list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) @@ -437,6 +434,8 @@ list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass) list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) list(REMOVE_ITEM TEST_OPS test_layers) +list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) +list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) @@ -485,6 +484,8 @@ endif() # Some ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test set(TEST_OPS_WITH_GC + test_affine_channel_op + test_scatter_op test_concat_op test_elementwise_add_op test_lookup_table_op @@ -571,6 +572,11 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND)) test_fused_dot_product_attention_op) endif() +py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS + ${GC_ENVS}) +py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS + ${GC_ENVS}) + set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -753,6 +759,7 @@ if(WITH_DISTRIBUTE) endif() # setting timeout value as 15S +set_tests_properties(test_isin PROPERTIES TIMEOUT 30) set_tests_properties(test_binomial_op PROPERTIES TIMEOUT 30) set_tests_properties(test_run PROPERTIES TIMEOUT 120) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 180) @@ -788,12 +795,14 @@ if(WITH_NV_JETSON) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 1200) set_tests_properties(test_batch_norm_op_prim_nchw PROPERTIES TIMEOUT 1500) set_tests_properties(test_batch_norm_op_prim_nhwc PROPERTIES TIMEOUT 1500) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500) else() set_tests_properties(test_concat_op PROPERTIES TIMEOUT 400) set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 150) set_tests_properties(test_batch_norm_op_prim_nchw PROPERTIES TIMEOUT 250) set_tests_properties(test_batch_norm_op_prim_nhwc PROPERTIES TIMEOUT 250) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules(test_conv3d_transpose_op MODULES test_conv3d_transpose_op @@ -946,6 +955,9 @@ if(WITH_CUDNN_FRONTEND) endif() set(TEST_CINN_OPS + test_assign_op + test_atan2_op + test_cast_op test_stack_op test_activation_op test_fill_any_like_op @@ -954,6 +966,22 @@ set(TEST_CINN_OPS test_elementwise_sub_op test_elementwise_div_op test_elementwise_max_op + test_elementwise_mul_op + test_elementwise_pow_op + test_expand_v2_op + test_flatten_contiguous_range_op + test_flip + test_full_like_op + test_top_k_op + test_top_k_v2_op + test_reshape_op + test_triangular_solve_op + test_split_op + test_scatter_op + test_reverse_op + test_roll_op + test_meshgrid_op + test_index_select_op test_mean_op test_clip_op test_gather_op @@ -997,6 +1025,10 @@ set_tests_properties( # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default. 
set(STATIC_BUILD_TESTS test_adagrad_op + test_eigh_op + test_matmul_op + test_matmul_v2_op + test_paddle_save_load_binary test_assign_pos_op test_bucketize_api test_c_embedding_op @@ -1099,3 +1131,45 @@ set_pir_tests_properties() set_tests_properties(test_nadam_op PROPERTIES TIMEOUT 100) set_tests_properties(test_radam_op PROPERTIES TIMEOUT 100) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) +set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv2d_op_depthwise_conv + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) +set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300) +set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) +set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_fractional_max_pool2d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) +set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_star_gan_with_gradient_penalty + PROPERTIES TIMEOUT 120) +set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) +if(WIN32) + set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900) +else() + set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600) +endif() +set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) +set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) +set_tests_properties(test_paddle_save_load_binary_static_build + PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/dist_ctr_reader.py b/test/legacy_test/dist_ctr_reader.py index 23f4daf2a5d8f..039d2c8aaf178 100644 --- a/test/legacy_test/dist_ctr_reader.py +++ b/test/legacy_test/dist_ctr_reader.py @@ -114,7 +114,7 @@ def train(self): Load trainset. ''' file_name = "train.txt" - logger.info("load trainset from %s" % file_name) + logger.info(f"load trainset from {file_name}") mode = TaskMode.create_train() return self._parse_creator(file_name, mode) @@ -123,7 +123,7 @@ def test(self): Load testset. 
''' file_name = "test.txt" - logger.info("load testset from %s" % file_name) + logger.info(f"load testset from {file_name}") mode = TaskMode.create_test() return self._parse_creator(file_name, mode) @@ -132,7 +132,7 @@ def infer(self): Load infer set. ''' file_name = "infer.txt" - logger.info("load inferset from %s" % file_name) + logger.info(f"load inferset from {file_name}") mode = TaskMode.create_infer() return self._parse_creator(file_name, mode) diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py index 210db283b979a..41c668043e3f8 100644 --- a/test/legacy_test/gradient_checker.py +++ b/test/legacy_test/gradient_checker.py @@ -324,7 +324,7 @@ def _compute_analytical_jacobian_pir( filted_idx, filted_dx = zip(*filted) # get the name in feeds of dyi - name = 'dys_%s' % i + name = f'dys_{i}' np_t = np.array(feeds[name]).astype(np_type) shape = np_t.shape np_t = np_t.flatten() @@ -392,7 +392,7 @@ def fail_test(msg): if in_pir_mode(): analytical = [] for i in range(len(y)): - name = 'dys_%s' % i + name = f'dys_{i}' feeds.update( { name: np.zeros( @@ -780,7 +780,7 @@ def get_pir_static_double_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Dgrad_%s' % i, + name=f'Dgrad_{i}', shape=yi.shape, dtype=np_type, ) @@ -797,7 +797,7 @@ def get_pir_static_double_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Dgrad_%s' % i, + name=f'Dgrad_{i}', shape=yi.shape, dtype=np_type, ) @@ -851,12 +851,12 @@ def get_pir_static_double_grad( yi = y[i] np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='dys_%s' % i, + name=f'dys_{i}', shape=yi.shape, dtype=np_type, ) value = np.ones(yi.shape, dtype=np_type) - feeds.update({'dys_%s' % i: value}) + feeds.update({f'dys_{i}': value}) dys.append(dy) # append second order backward @@ -1130,7 +1130,7 @@ def get_pir_static_triple_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Tgrad_%s' % i, + name=f'Tgrad_{i}', shape=yi.shape, dtype=np_type, ) @@ -1147,7 +1147,7 @@ def get_pir_static_triple_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Tgrad_%s' % i, + name=f'Tgrad_{i}', shape=yi.shape, dtype=np_type, ) diff --git a/test/legacy_test/op.py b/test/legacy_test/op.py index 0dec2f001188e..e60a0e63ae8dd 100644 --- a/test/legacy_test/op.py +++ b/test/legacy_test/op.py @@ -163,7 +163,7 @@ def __call__(self, *args, **kwargs): new_attr.scalars.MergeFrom(item) else: raise NotImplementedError( - "A not supported attribute type: %s." % (str(attr.type)) + f"A not supported attribute type: {str(attr.type)}." ) for attr_name, defalut_val in self.__extra_attrs__.items(): user_defined_attr = kwargs.get(attr_name, None) @@ -212,7 +212,7 @@ def __call__(self, *args, **kwargs): new_attr.scalars.MergeFrom(item) else: raise NotImplementedError( - "A not supported attribute type: %s." % (str(attr_type)) + f"A not supported attribute type: {str(attr_type)}." ) return op_desc @@ -292,7 +292,7 @@ def types(self): def get_op_info(self, t): if t not in self.op_methods: - raise ValueError("The operator: %s is not registered." 
% t) + raise ValueError(f"The operator: {t} is not registered.") return self.op_methods.get(t) def get_op_input_names(self, type): diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index ed4e0f478ed38..eec710f01cf8e 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -114,7 +114,7 @@ def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs): ) input_t.append( paddle.static.data( - name='data_%s' % index, shape=shape, dtype=dtype + name=f'data_{index}', shape=shape, dtype=dtype ) ) @@ -223,7 +223,7 @@ def __get_elem__(tensor, i): return tensor._get_complex128_element(i) else: raise TypeError( - "Unsupported test data type %s." % tensor_to_check_dtype + f"Unsupported test data type {tensor_to_check_dtype}." ) def __set_elem__(tensor, i, e): @@ -251,7 +251,7 @@ def __set_elem__(tensor, i, e): return tensor._set_complex128_element(i, e) else: raise TypeError( - "Unsupported test data type %s." % tensor_to_check_dtype + f"Unsupported test data type {tensor_to_check_dtype}." ) # we only compute gradient of one element each time. @@ -501,7 +501,7 @@ def is_complex_test(): and not hasattr(cls, "exist_check_grad") ): raise AssertionError( - "This test of %s op needs check_grad." % cls.op_type + f"This test of {cls.op_type} op needs check_grad." ) # check for op test with fp64 precision, but not check onednn op test for now @@ -518,8 +518,7 @@ def is_complex_test(): and not cls.check_prim_pir ): raise AssertionError( - "This test of %s op needs check_grad with fp64 precision." - % cls.op_type + f"This test of {cls.op_type} op needs check_grad with fp64 precision." ) if ( @@ -1061,7 +1060,7 @@ def create_var( name_temp = name else: nplist_value_temp = np_list[name] - name_temp = unique_name.generate("%s_out" % (name)) + name_temp = unique_name.generate(f"{name}_out") v = create_var( nplist_value_temp, name_temp, @@ -1184,10 +1183,9 @@ def cal_python_api(python_api, args, kernel_sig): return None if not hasattr(self, "python_api"): print(kernel_sig) - assert hasattr(self, "python_api"), ( - "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True" - % self.op_type - ) + assert hasattr( + self, "python_api" + ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True" args = OpTestUtils.prepare_python_api_arguments( self.python_api, dygraph_tensor_inputs, @@ -1288,10 +1286,9 @@ def get_kernel_signature(self, place, egr_inps=None, egr_oups=None): return None if not hasattr(self, "python_api"): print(kernel_sig) - assert hasattr(self, "python_api"), ( - "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True" - % self.op_type - ) + assert hasattr( + self, "python_api" + ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True" return kernel_sig def get_ir_input_attr_dict_and_feed(self, stop_gradient): @@ -2573,7 +2570,7 @@ def _is_skip_name(self, name): not in no_check_set_white_list.no_check_set_white_list ): raise AssertionError( - "no_check_set of op %s must be set to None." % self.op_type + f"no_check_set of op {self.op_type} must be set to None." 
) if check_prim: @@ -3091,7 +3088,7 @@ def check_grad_with_place_for_static( analytic_grads, inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", atol=atol, ) @@ -3366,7 +3363,7 @@ def check_grad_with_place( dygraph_dygraph_grad, inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", atol=atol, ) @@ -3406,7 +3403,7 @@ def check_grad_with_place( pir_grad, inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", atol=atol, ) @@ -3484,7 +3481,7 @@ def _get_dygraph_grad( ) else: raise TypeError( - "Unsupported test data type %s." % type(cast_input) + f"Unsupported test data type {type(cast_input)}." ) outputs = {} @@ -3850,12 +3847,12 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): range(len(user_defined_grad_outputs)), ): grad_val = paddle.static.data( - name='val_grad_%s' % idx, + name=f'val_grad_{idx}', shape=grad_out_value.shape, dtype=grad_out_value.dtype, ) grad_outputs.append(grad_val) - feed.update({'val_grad_%s' % idx: grad_out_value}) + feed.update({f'val_grad_{idx}': grad_out_value}) # delete the inputs which no need to calculate grad for no_grad_val in no_grad_set: del static_inputs[no_grad_val] @@ -3894,8 +3891,7 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): ) else: raise TypeError( - "Unsupported test data type %s." - % type(cast_input) + f"Unsupported test data type {type(cast_input)}." ) outputs = {} diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py index 6894d37a2839a..c059499f43e16 100644 --- a/test/legacy_test/prim_op_test.py +++ b/test/legacy_test/prim_op_test.py @@ -100,8 +100,7 @@ def _get_kernel_signature( """we think the kernel_sig is missing.""" kernel_sig = None print( - "[Warning: op_test.py] Kernel Signature is not found for %s, fall back to intermediate state." - % op_type + f"[Warning: op_test.py] Kernel Signature is not found for {op_type}, fall back to intermediate state." 
) return kernel_sig @@ -677,9 +676,9 @@ def check_static_comp(self): # ensure the operator not in program if check_prim is True if not in_pir_mode(): forward_ops = [op.type for op in main_program.blocks[0].ops] - assert self.op_type not in forward_ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (self.op_type) + assert ( + self.op_type not in forward_ops + ), f"{self.op_type} shouldn't appear in program when check_prim is True" exe = paddle.static.Executor(self.place) exe.run(startup_program) ret = exe.run(main_program, feed=feed, fetch_list=ret) @@ -761,9 +760,9 @@ def check_jit_comp(self): .forward_program.block(0) .ops ] - assert self.op_type not in forward_ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (self.op_type) + assert ( + self.op_type not in forward_ops + ), f"{self.op_type} shouldn't appear in program when check_prim is True" ret = flatten(_as_list(net(args))) ret = paddle.utils.map_structure(lambda x: x.numpy(), ret) if OpTestUtils.is_bfloat16_type(self.dtype): @@ -854,9 +853,9 @@ def check_jit_comp_with_cinn(self): .forward_program.block(0) .ops ] - assert self.op_type not in forward_ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (self.op_type) + assert ( + self.op_type not in forward_ops + ), f"{self.op_type} shouldn't appear in program when check_prim is True" ret = flatten(_as_list(net(args))) ret = paddle.utils.map_structure(lambda x: x.numpy(), ret) if OpTestUtils.is_bfloat16_type(self.dtype): @@ -1160,9 +1159,9 @@ def check_static_comp(self): if not in_pir_mode(): ops = [op.type for op in main_program.blocks[0].ops] backward_op_type = self.op_type + "_grad" - assert backward_op_type not in ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (backward_op_type) + assert ( + backward_op_type not in ops + ), f"{backward_op_type} shouldn't appear in program when check_prim is True" elif self.prim_op_type == "prim": grad_ops = [] for op in main_program.global_block().ops: @@ -1261,9 +1260,9 @@ def check_jit_comp(self): .ops ] backward_op_type = self.op_type + "_grad" - assert backward_op_type not in ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (backward_op_type) + assert ( + backward_op_type not in ops + ), f"{backward_op_type} shouldn't appear in program when check_prim is True" out = _as_list(net(args)) if hasattr(self.op_test, "python_out_sig"): outputs_sig = self.op_test.python_out_sig @@ -1387,9 +1386,9 @@ def check_jit_comp_with_cinn(self): .ops ] backward_op_type = self.op_type + "_grad" - assert backward_op_type not in ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (backward_op_type) + assert ( + backward_op_type not in ops + ), f"{backward_op_type} shouldn't appear in program when check_prim is True" out = _as_list(net(args)) if hasattr(self.op_test, "python_out_sig"): diff --git a/test/legacy_test/test_ZeroPad1d.py b/test/legacy_test/test_ZeroPad1d.py new file mode 100644 index 0000000000000..31baf6a7cf246 --- /dev/null +++ b/test/legacy_test/test_ZeroPad1d.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle import to_tensor +from paddle.nn import ZeroPad1D + + +class TestZeroPad1dAPI(unittest.TestCase): + def setUp(self): + if paddle.is_compiled_with_cuda(): + paddle.device.set_device('gpu:0') + else: + paddle.device.set_device('cpu') + self.shape = [4, 6, 6] + self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] + + def test_support_dtypes(self): + for dtype in self.support_dtypes: + pad = 2 + x = np.random.randint(-255, 255, size=self.shape).astype(dtype) + expect_res = np.pad( + x, + [[0, 0], [0, 0], [pad, pad]], + mode='constant', + constant_values=0, + ) + + x_tensor = to_tensor(x).astype(dtype) + zeropad1d = ZeroPad1D(padding=pad) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad2(self): + pad = [1, 2] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, [[0, 0], [0, 0], pad], mode='constant', constant_values=0 + ) + + x_tensor = to_tensor(x) + zeropad1d = ZeroPad1D(padding=pad) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad3(self): + pad = (1, 2) + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad(x, [[0, 0], [0, 0], [pad[0], pad[1]]]) + + x_tensor = to_tensor(x) + zeropad1d = ZeroPad1D(padding=pad) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad4(self): + pad = [1, 2] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad(x, [[0, 0], [0, 0], [pad[0], pad[1]]]) + + x_tensor = to_tensor(x) + pad_tensor = to_tensor(pad, dtype='int32') + zeropad1d = ZeroPad1D(padding=pad_tensor) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_repr(self): + pad = [1, 2] + zeropad1d = ZeroPad1D(padding=pad) + name_str = zeropad1d.extra_repr() + assert name_str == 'padding=[1, 2], data_format=NCL' + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_ZeroPad3d.py b/test/legacy_test/test_ZeroPad3d.py new file mode 100644 index 0000000000000..8cc7a45c959df --- /dev/null +++ b/test/legacy_test/test_ZeroPad3d.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
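+
+# A standalone sketch of the contract the cases in this file assert,
+# assuming the public paddle.nn.ZeroPad3D API exercised below; the
+# padding order (left, right, top, bottom, front, back) is inferred
+# from the numpy reference padding used in the tests, so treat it as
+# illustrative rather than authoritative.
+def _zeropad3d_sketch():
+    import numpy as np
+    import paddle
+    from paddle.nn import ZeroPad3D
+
+    x = paddle.rand([1, 2, 3, 3, 3])  # NCDHW input
+    pad = (1, 2, 3, 4, 5, 6)  # W gets (1, 2), H gets (3, 4), D gets (5, 6)
+    ref = np.pad(
+        x.numpy(),
+        [[0, 0], [0, 0], [pad[4], pad[5]], [pad[2], pad[3]], [pad[0], pad[1]]],
+        mode='constant',
+        constant_values=0,
+    )
+    np.testing.assert_allclose(ZeroPad3D(padding=pad)(x).numpy(), ref)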
+ +import unittest + +import numpy as np + +import paddle +from paddle import to_tensor +from paddle.nn import ZeroPad3D + + +class TestZeroPad3DAPI(unittest.TestCase): + def setUp(self): + if paddle.is_compiled_with_cuda(): + paddle.device.set_device('gpu:0') + else: + paddle.device.set_device('cpu') + self.shape = [4, 3, 6, 6, 6] + self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] + + def test_support_dtypes(self): + for dtype in self.support_dtypes: + pad = 2 + x = np.random.randint(-255, 255, size=self.shape).astype(dtype) + expect_res = np.pad( + x, + [[0, 0], [0, 0], [pad, pad], [pad, pad], [pad, pad]], + mode='constant', + constant_values=0, + ) + + x_tensor = to_tensor(x).astype(dtype) + zeropad3d = ZeroPad3D(padding=pad) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad2(self): + pad = [1, 2, 3, 4, 5, 6] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, + [ + [0, 0], + [0, 0], + [pad[4], pad[5]], + [pad[2], pad[3]], + [pad[0], pad[1]], + ], + mode='constant', + constant_values=0, + ) + + x_tensor = to_tensor(x) + zeropad3d = ZeroPad3D(padding=pad) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad3(self): + pad = (1, 2, 3, 4, 5, 6) + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, + [ + [0, 0], + [0, 0], + [pad[4], pad[5]], + [pad[2], pad[3]], + [pad[0], pad[1]], + ], + ) + + x_tensor = to_tensor(x) + zeropad3d = ZeroPad3D(padding=pad) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad4(self): + pad = [1, 2, 3, 4, 5, 6] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, + [ + [0, 0], + [0, 0], + [pad[4], pad[5]], + [pad[2], pad[3]], + [pad[0], pad[1]], + ], + ) + + x_tensor = to_tensor(x) + pad_tensor = to_tensor(pad, dtype='int32') + zeropad3d = ZeroPad3D(padding=pad_tensor) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_repr(self): + pad = pad = [1, 2, 3, 4, 5, 6] + zeropad3d = ZeroPad3D(padding=pad) + name_str = zeropad3d.extra_repr() + assert name_str == 'padding=[1, 2, 3, 4, 5, 6], data_format=NCDHW' + + +if __name__ == '__main__': + unittest.main() diff --git a/test/deprecated/legacy_test/test_accuracy_op.py b/test/legacy_test/test_accuracy_op.py similarity index 99% rename from test/deprecated/legacy_test/test_accuracy_op.py rename to test/legacy_test/test_accuracy_op.py index 44c4cfa7c49ac..bf6d86d10da9e 100755 --- a/test/deprecated/legacy_test/test_accuracy_op.py +++ b/test/legacy_test/test_accuracy_op.py @@ -126,7 +126,7 @@ def test_type_errors(self): self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) x3 = paddle.static.data( - name='input', shape=[-1, 2], dtype="float16" + name='input', shape=[-1, 2], dtype="float32" ) paddle.static.accuracy(input=x3, label=label) paddle.metric.accuracy(input=x3, label=label) diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 7806017bbfeed..5e727c7580580 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -365,19 +365,19 @@ def test_out_name(self): data = paddle.static.data( name="X", shape=[-1, 1], dtype="float32" ) - out = eval("paddle.%s(data, name='Y')" % self.op_type) + out = eval(f"paddle.{self.op_type}(data, name='Y')") place = 
base.CPUPlace() exe = base.Executor(place) (result,) = exe.run(feed={"X": np_x}, fetch_list=[out]) - expected = eval("np.%s(np_x)" % self.op_type) + expected = eval(f"np.{self.op_type}(np_x)") np.testing.assert_allclose(result, expected, rtol=1e-05) def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([0.1]) x = paddle.to_tensor(np_x) - z = eval("paddle.%s(x).numpy()" % self.op_type) - z_expected = eval("np.%s(np_x)" % self.op_type) + z = eval(f"paddle.{self.op_type}(x).numpy()") + z_expected = eval(f"np.{self.op_type}(np_x)") np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -5359,7 +5359,7 @@ def create_test_act_fp16_class( enable_cinn=False, check_pir=False, grad_atol=1e-2, - **kwargs + **kwargs, ): @unittest.skipIf( not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" @@ -5556,7 +5556,7 @@ def create_test_act_bf16_class( check_pir=False, check_prim_pir=False, grad_atol=1e-2, - **kwargs + **kwargs, ): @unittest.skipIf( not core.is_compiled_with_cuda() diff --git a/test/deprecated/legacy_test/test_add_position_encoding_op.py b/test/legacy_test/test_add_position_encoding_op.py similarity index 100% rename from test/deprecated/legacy_test/test_add_position_encoding_op.py rename to test/legacy_test/test_add_position_encoding_op.py diff --git a/test/deprecated/legacy_test/test_addmm_op.py b/test/legacy_test/test_addmm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_addmm_op.py rename to test/legacy_test/test_addmm_op.py diff --git a/test/deprecated/legacy_test/test_affine_channel_op.py b/test/legacy_test/test_affine_channel_op.py similarity index 100% rename from test/deprecated/legacy_test/test_affine_channel_op.py rename to test/legacy_test/test_affine_channel_op.py diff --git a/test/deprecated/legacy_test/test_affine_grid_op.py b/test/legacy_test/test_affine_grid_op.py similarity index 100% rename from test/deprecated/legacy_test/test_affine_grid_op.py rename to test/legacy_test/test_affine_grid_op.py diff --git a/test/deprecated/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py similarity index 100% rename from test/deprecated/legacy_test/test_assign_op.py rename to test/legacy_test/test_assign_op.py diff --git a/test/deprecated/legacy_test/test_atan2_op.py b/test/legacy_test/test_atan2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_atan2_op.py rename to test/legacy_test/test_atan2_op.py diff --git a/test/deprecated/legacy_test/test_attribute_var.py b/test/legacy_test/test_attribute_var.py similarity index 82% rename from test/deprecated/legacy_test/test_attribute_var.py rename to test/legacy_test/test_attribute_var.py index e06e8a3d80d50..cdae49ba0741a 100644 --- a/test/deprecated/legacy_test/test_attribute_var.py +++ b/test/legacy_test/test_attribute_var.py @@ -66,43 +66,6 @@ def infer_prog(self): return res -class TestDropout(UnittestBase): - def init_info(self): - self.shapes = [[10, 10]] - self.save_path = os.path.join(self.temp_dir.name, 'dropout') - - def test_static(self): - main_prog = Program() - startup_prog = Program() - with program_guard(main_prog, startup_prog): - fc = paddle.nn.Linear(10, 10) - x = paddle.randn(self.shapes[0]) - x.stop_gradient = False - feat = fc(x) - # p is a Variable - p = paddle.randn([1]) - out = paddle.nn.functional.dropout(feat, p=p) - sgd = paddle.optimizer.SGD() - sgd.minimize(paddle.mean(out)) - # test _to_string - self.assertTrue("Var[" in str(main_prog)) - - exe = paddle.static.Executor() - exe.run(startup_prog) - res = 
exe.run(fetch_list=[x, out]) - # export model - paddle.static.save_inference_model(self.save_path, [x], [out], exe) - - # Test for Inference Predictor - infer_out = self.infer_prog() - self.assertEqual(infer_out.shape, (10, 10)) - - self.assertEqual( - main_prog.block(0).ops[4].all_attrs()['dropout_prob'].name, - p.name, - ) - - class TestTileTensorList(UnittestBase): def init_info(self): self.shapes = [[2, 3, 4]] diff --git a/test/deprecated/legacy_test/test_bce_loss.py b/test/legacy_test/test_bce_loss.py similarity index 100% rename from test/deprecated/legacy_test/test_bce_loss.py rename to test/legacy_test/test_bce_loss.py diff --git a/test/deprecated/legacy_test/test_bicubic_interp_op.py b/test/legacy_test/test_bicubic_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_bicubic_interp_op.py rename to test/legacy_test/test_bicubic_interp_op.py diff --git a/test/deprecated/legacy_test/test_bilinear_interp_op.py b/test/legacy_test/test_bilinear_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_bilinear_interp_op.py rename to test/legacy_test/test_bilinear_interp_op.py diff --git a/test/legacy_test/test_block_diag.py b/test/legacy_test/test_block_diag.py new file mode 100644 index 0000000000000..842f360f33c4b --- /dev/null +++ b/test/legacy_test/test_block_diag.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
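+
+# A pure-NumPy sketch of the block_diag contract asserted below against
+# scipy.linalg.block_diag (a hedged reference, not the actual kernel):
+# 1-D inputs are promoted to 1 x n row blocks, the blocks are laid out
+# along the main diagonal, and every off-block entry is zero.
+def _block_diag_reference(mats):
+    import numpy as np
+
+    mats = [np.atleast_2d(np.asarray(m)) for m in mats]
+    rows = sum(m.shape[0] for m in mats)
+    cols = sum(m.shape[1] for m in mats)
+    out = np.zeros((rows, cols), dtype=np.result_type(*mats))
+    r = c = 0
+    for m in mats:
+        h, w = m.shape
+        out[r:r + h, c:c + w] = m  # place the block, advance the corner
+        r += h
+        c += w
+    return out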
+ +import unittest + +import numpy as np +import scipy + +import paddle +from paddle import base + + +class TestBlockDiagError(unittest.TestCase): + def test_errors(self): + def test_type_error(): + A = np.array([[1, 2], [3, 4]]) + B = np.array([[5, 6], [7, 8]]) + C = np.array([[9, 10], [11, 12]]) + with paddle.static.program_guard(base.Program()): + out = paddle.block_diag([A, B, C]) + + self.assertRaises(TypeError, test_type_error) + + def test_dime_error(): + A = paddle.to_tensor([[[1, 2], [3, 4]]]) + B = paddle.to_tensor([[[5, 6], [7, 8]]]) + C = paddle.to_tensor([[[9, 10], [11, 12]]]) + with paddle.static.program_guard(base.Program()): + out = paddle.block_diag([A, B, C]) + + self.assertRaises(ValueError, test_dime_error) + + +class TestBlockDiag(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.type_list = ['int32', 'int64', 'float32', 'float64'] + self.place = [('cpu', paddle.CPUPlace())] + ( + [('gpu', paddle.CUDAPlace(0))] + if paddle.is_compiled_with_cuda() + else [] + ) + + def test_dygraph(self): + paddle.disable_static() + for device, place in self.place: + paddle.set_device(device) + for i in self.type_list: + A = np.random.randn(2, 3).astype(i) + B = np.random.randn(2).astype(i) + C = np.random.randn(4, 1).astype(i) + s_out = scipy.linalg.block_diag(A, B, C) + + A_tensor = paddle.to_tensor(A) + B_tensor = paddle.to_tensor(B) + C_tensor = paddle.to_tensor(C) + out = paddle.block_diag([A_tensor, B_tensor, C_tensor]) + np.testing.assert_allclose(out.numpy(), s_out) + + def test_static(self): + paddle.enable_static() + for device, place in self.place: + paddle.set_device(device) + for i in self.type_list: + A = np.random.randn(2, 3).astype(i) + B = np.random.randn(2).astype(i) + C = np.random.randn(4, 1).astype(i) + s_out = scipy.linalg.block_diag(A, B, C) + + with paddle.static.program_guard(paddle.static.Program()): + A_tensor = paddle.static.data('A', [2, 3], i) + B_tensor = paddle.static.data('B', [2], i) + C_tensor = paddle.static.data('C', [4, 1], i) + out = paddle.block_diag([A_tensor, B_tensor, C_tensor]) + exe = paddle.static.Executor(place) + res = exe.run( + feed={'A': A, 'B': B, 'C': C}, + fetch_list=[out], + ) + np.testing.assert_allclose(res[0], s_out) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/deprecated/legacy_test/test_bmm_op.py b/test/legacy_test/test_bmm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_bmm_op.py rename to test/legacy_test/test_bmm_op.py diff --git a/test/deprecated/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cast_op.py rename to test/legacy_test/test_cast_op.py diff --git a/test/deprecated/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py similarity index 100% rename from test/deprecated/legacy_test/test_channel_shuffle.py rename to test/legacy_test/test_channel_shuffle.py diff --git a/test/legacy_test/test_cholesky_op.py b/test/legacy_test/test_cholesky_op.py index d98596fc29c89..25fc0f9365299 100644 --- a/test/legacy_test/test_cholesky_op.py +++ b/test/legacy_test/test_cholesky_op.py @@ -121,14 +121,14 @@ def func(self, place): for i in range(len(out)): yi = out[i] dy = paddle.static.data( - name='dys_%s' % i, + name=f'dys_{i}', shape=yi.shape, dtype=root_data.dtype, ) dy.stop_gradient = False dy.persistable = True value = np.zeros(yi.shape, dtype=root_data.dtype) - feeds.update({'dys_%s' % i: value}) + feeds.update({f'dys_{i}': value}) dys.append(dy) fetch_list = 
base.gradients(out, root, dys) grad_check( diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index fa31fe1e16b54..dfc5c36a7eb5a 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -201,7 +201,7 @@ def setUp(self): self._trainers = 2 self._ps_endpoints = f"127.0.0.1:{self._find_free_port()},127.0.0.1:{self._find_free_port()}" self._python_interp = sys.executable - self._master_endpoints = "127.0.0.1:%s" % (self._find_free_port()) + self._master_endpoints = f"127.0.0.1:{self._find_free_port()}" self.temp_dir = tempfile.TemporaryDirectory() @@ -305,15 +305,15 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() with open(path0, "r") as f: - sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 0 stderr file: {f.read()}\n') with open(path1, "r") as f: - sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 1 stderr file: {f.read()}\n') def load_and_remove(path): with open(path, 'rb') as f: diff --git a/test/legacy_test/test_collective_base.py b/test/legacy_test/test_collective_base.py index b11b992bcd5f8..07573f6ce7e00 100644 --- a/test/legacy_test/test_collective_base.py +++ b/test/legacy_test/test_collective_base.py @@ -232,8 +232,8 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() diff --git a/test/deprecated/legacy_test/test_complex_abs.py b/test/legacy_test/test_complex_abs.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_abs.py rename to test/legacy_test/test_complex_abs.py diff --git a/test/deprecated/legacy_test/test_complex_op.py b/test/legacy_test/test_complex_op.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_op.py rename to test/legacy_test/test_complex_op.py diff --git a/test/deprecated/legacy_test/test_complex_variable.py b/test/legacy_test/test_complex_variable.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_variable.py rename to test/legacy_test/test_complex_variable.py diff --git a/test/deprecated/legacy_test/test_complex_view_op.py b/test/legacy_test/test_complex_view_op.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_view_op.py rename to test/legacy_test/test_complex_view_op.py diff --git a/test/deprecated/legacy_test/test_conj_op.py b/test/legacy_test/test_conj_op.py similarity index 100% rename from test/deprecated/legacy_test/test_conj_op.py rename to test/legacy_test/test_conj_op.py diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py index a3bfa75d1225f..b0b0d0abe2d96 100644 --- a/test/legacy_test/test_conv2d_op.py +++ b/test/legacy_test/test_conv2d_op.py @@ -34,14 +34,14 @@ def conv2d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", 
"EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCHW' or 'NHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCHW' or 'NHWC'." ) channel_last = data_format == "NHWC" diff --git a/test/deprecated/legacy_test/test_conv2d_op_depthwise_conv.py b/test/legacy_test/test_conv2d_op_depthwise_conv.py similarity index 100% rename from test/deprecated/legacy_test/test_conv2d_op_depthwise_conv.py rename to test/legacy_test/test_conv2d_op_depthwise_conv.py diff --git a/test/legacy_test/test_conv2d_transpose_op.py b/test/legacy_test/test_conv2d_transpose_op.py index 36796adfdaec2..dd14afecf09ec 100644 --- a/test/legacy_test/test_conv2d_transpose_op.py +++ b/test/legacy_test/test_conv2d_transpose_op.py @@ -37,8 +37,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if attrs['data_format'] == 'NHWC': diff --git a/test/deprecated/legacy_test/test_conv2d_transpose_op_depthwise_conv.py b/test/legacy_test/test_conv2d_transpose_op_depthwise_conv.py similarity index 100% rename from test/deprecated/legacy_test/test_conv2d_transpose_op_depthwise_conv.py rename to test/legacy_test/test_conv2d_transpose_op_depthwise_conv.py diff --git a/test/legacy_test/test_conv3d_op.py b/test/legacy_test/test_conv3d_op.py index 143deb493c756..a41580c7b0445 100644 --- a/test/legacy_test/test_conv3d_op.py +++ b/test/legacy_test/test_conv3d_op.py @@ -37,14 +37,14 @@ def conv3d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCDHW' or 'NDHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCDHW' or 'NDHWC'." ) channel_last = data_format == "NDHWC" diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py index 78d88d53ff500..9e6f3445eaf99 100644 --- a/test/legacy_test/test_conv3d_transpose_op.py +++ b/test/legacy_test/test_conv3d_transpose_op.py @@ -42,8 +42,8 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." 
) if attrs['data_format'] == 'NHWC': diff --git a/test/deprecated/legacy_test/test_copysign_op.py b/test/legacy_test/test_copysign_op.py similarity index 100% rename from test/deprecated/legacy_test/test_copysign_op.py rename to test/legacy_test/test_copysign_op.py diff --git a/test/deprecated/legacy_test/test_crop_tensor_op.py b/test/legacy_test/test_crop_tensor_op.py similarity index 100% rename from test/deprecated/legacy_test/test_crop_tensor_op.py rename to test/legacy_test/test_crop_tensor_op.py diff --git a/test/deprecated/legacy_test/test_cross_entropy2_op.py b/test/legacy_test/test_cross_entropy2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cross_entropy2_op.py rename to test/legacy_test/test_cross_entropy2_op.py diff --git a/test/deprecated/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cross_entropy_op.py rename to test/legacy_test/test_cross_entropy_op.py diff --git a/test/deprecated/legacy_test/test_cummax_op.py b/test/legacy_test/test_cummax_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cummax_op.py rename to test/legacy_test/test_cummax_op.py diff --git a/test/deprecated/legacy_test/test_cumprod_op.py b/test/legacy_test/test_cumprod_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cumprod_op.py rename to test/legacy_test/test_cumprod_op.py diff --git a/test/deprecated/legacy_test/test_deformable_conv_v1_op.py b/test/legacy_test/test_deformable_conv_v1_op.py similarity index 100% rename from test/deprecated/legacy_test/test_deformable_conv_v1_op.py rename to test/legacy_test/test_deformable_conv_v1_op.py diff --git a/test/deprecated/legacy_test/test_determinant_op.py b/test/legacy_test/test_determinant_op.py similarity index 100% rename from test/deprecated/legacy_test/test_determinant_op.py rename to test/legacy_test/test_determinant_op.py diff --git a/test/deprecated/legacy_test/test_diagonal_op.py b/test/legacy_test/test_diagonal_op.py similarity index 100% rename from test/deprecated/legacy_test/test_diagonal_op.py rename to test/legacy_test/test_diagonal_op.py diff --git a/test/legacy_test/test_dist_base.py b/test/legacy_test/test_dist_base.py index 0abf18fe42c87..143f7e1ee8e62 100755 --- a/test/legacy_test/test_dist_base.py +++ b/test/legacy_test/test_dist_base.py @@ -1040,7 +1040,7 @@ def __free_port(): ) as s: s.bind(('', 0)) print_to_err( - type(self).__name__, "socket name: %s" % s.getsockname()[1] + type(self).__name__, f"socket name: {s.getsockname()[1]}" ) return s.getsockname()[1] @@ -1479,10 +1479,9 @@ def _get_nccl2_trainer_cmd( def _run_cluster_gloo( self, model, envs, update_method, check_error_log, log_name ): - assert update_method == "gloo", ( - "_run_cluster_gloo must have update_method: gloo, but get %s" - % update_method - ) + assert ( + update_method == "gloo" + ), f"_run_cluster_gloo must have update_method: gloo, but get {update_method}" assert ( not self._use_hallreduce ), "_run_cluster_gloo must have _use_hallreduce = false" @@ -1551,9 +1550,7 @@ def _run_cluster_nccl2( if DIST_UT_PORT == 0: # NOTE(wangxi). 
hallreduce test must use 4cards after nccl>=2.7 for i in range(0, 4): - self._ps_endpoints += "127.0.0.1:%s," % ( - self._find_free_port() - ) + self._ps_endpoints += f"127.0.0.1:{self._find_free_port()}," else: for i in range(0, 4): self._ps_endpoints += "127.0.0.1:%s," % (DIST_UT_PORT + i) diff --git a/test/legacy_test/test_dist_hapi_model.py b/test/legacy_test/test_dist_hapi_model.py index 03a92d6f3cbc9..e41f5b344a594 100644 --- a/test/legacy_test/test_dist_hapi_model.py +++ b/test/legacy_test/test_dist_hapi_model.py @@ -70,9 +70,11 @@ def start_local_trainers( procs = [] for t in pod.trainers: proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), + "FLAGS_selected_gpus": "{}".format( + ",".join([str(g) for g in t.gpus]) + ), "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "FLAGS_dynamic_static_unified_comm": "0", diff --git a/test/deprecated/legacy_test/test_eigh_op.py b/test/legacy_test/test_eigh_op.py similarity index 100% rename from test/deprecated/legacy_test/test_eigh_op.py rename to test/legacy_test/test_eigh_op.py diff --git a/test/deprecated/legacy_test/test_elementwise_heaviside_op.py b/test/legacy_test/test_elementwise_heaviside_op.py similarity index 100% rename from test/deprecated/legacy_test/test_elementwise_heaviside_op.py rename to test/legacy_test/test_elementwise_heaviside_op.py diff --git a/test/deprecated/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py similarity index 100% rename from test/deprecated/legacy_test/test_elementwise_mul_op.py rename to test/legacy_test/test_elementwise_mul_op.py diff --git a/test/deprecated/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py similarity index 100% rename from test/deprecated/legacy_test/test_elementwise_pow_op.py rename to test/legacy_test/test_elementwise_pow_op.py diff --git a/test/deprecated/legacy_test/test_expand_v2_op.py b/test/legacy_test/test_expand_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_expand_v2_op.py rename to test/legacy_test/test_expand_v2_op.py diff --git a/test/deprecated/legacy_test/test_fill_any_op.py b/test/legacy_test/test_fill_any_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fill_any_op.py rename to test/legacy_test/test_fill_any_op.py diff --git a/test/deprecated/legacy_test/test_fill_diagonal_tensor_op.py b/test/legacy_test/test_fill_diagonal_tensor_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fill_diagonal_tensor_op.py rename to test/legacy_test/test_fill_diagonal_tensor_op.py diff --git a/test/deprecated/legacy_test/test_flatten_contiguous_range_op.py b/test/legacy_test/test_flatten_contiguous_range_op.py similarity index 100% rename from test/deprecated/legacy_test/test_flatten_contiguous_range_op.py rename to test/legacy_test/test_flatten_contiguous_range_op.py diff --git a/test/deprecated/legacy_test/test_flip.py b/test/legacy_test/test_flip.py similarity index 100% rename from test/deprecated/legacy_test/test_flip.py rename to test/legacy_test/test_flip.py diff --git a/test/deprecated/legacy_test/test_fmax_op.py b/test/legacy_test/test_fmax_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fmax_op.py rename to test/legacy_test/test_fmax_op.py diff --git 
a/test/deprecated/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fmin_op.py rename to test/legacy_test/test_fmin_op.py diff --git a/test/deprecated/legacy_test/test_fold_op.py b/test/legacy_test/test_fold_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fold_op.py rename to test/legacy_test/test_fold_op.py diff --git a/test/deprecated/legacy_test/test_fractional_max_pool2d_op.py b/test/legacy_test/test_fractional_max_pool2d_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fractional_max_pool2d_op.py rename to test/legacy_test/test_fractional_max_pool2d_op.py diff --git a/test/deprecated/legacy_test/test_full_like_op.py b/test/legacy_test/test_full_like_op.py similarity index 100% rename from test/deprecated/legacy_test/test_full_like_op.py rename to test/legacy_test/test_full_like_op.py diff --git a/test/legacy_test/test_fused_groupnorm.py b/test/legacy_test/test_fused_groupnorm.py new file mode 100644 index 0000000000000..5dbaa4d5a569d --- /dev/null +++ b/test/legacy_test/test_fused_groupnorm.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
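+
+# A compact sketch of the group-norm math that the NumPy reference in this
+# file spells out per layout (hedged, channels-first inputs only): reshape
+# (N, C, *spatial) into (N * groups, -1), normalize each row with its own
+# mean and variance, then apply per-channel scale and bias. The fused op
+# under test additionally folds in a residual add and an optional silu.
+def _group_norm_sketch(x, scale, bias, groups, epsilon=1e-5):
+    import numpy as np
+
+    N, C = x.shape[:2]
+    g = x.reshape(N * groups, -1)
+    mean = g.mean(axis=1, keepdims=True)
+    var = g.var(axis=1, keepdims=True)
+    y = ((g - mean) / np.sqrt(var + epsilon)).reshape(x.shape)
+    affine = (1, C) + (1,) * (x.ndim - 2)  # broadcast over spatial dims
+    return y * scale.reshape(affine) + bias.reshape(affine)
+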
+import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle import base +from paddle.base import core +from paddle.base.layer_helper import LayerHelper + + +def naive_residual_add(x, residual): + return np.add(x, residual) + + +def naive_group_norm(x, scale, bias, epsilon, groups, data_layout): + dim = x.ndim + if dim == 3: + if data_layout == "NHWC": + x = np.transpose(x, (0, 2, 1)) # NLC => NCL + N, C, L = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, L)) * scale.reshape( + (-1, 1) + ) + bias.reshape((-1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 1)) # NCL => NLC + return [output, mean.reshape((N, G)), var.reshape((N, G))] + elif dim == 4: + if data_layout == "NHWC": + x = np.transpose(x, (0, 3, 1, 2)) # NHWC => NCHW + N, C, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, H, W)) * scale.reshape( + (-1, 1, 1) + ) + bias.reshape((-1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 1)) # NCHW => NHWC + return [output, mean.reshape((N, G)), var.reshape((N, G))] + else: + if data_layout == "NHWC": + x = np.transpose(x, (0, 4, 1, 2, 3)) # NDHWC => NCDHW + N, C, D, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, D, H, W)) * scale.reshape( + (-1, 1, 1, 1) + ) + bias.reshape((-1, 1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 4, 1)) # NCDHW => NDHWC + return [output, mean.reshape((N, G)), var.reshape((N, G))] + + +def naive_residual_biasadd_layer_norm( + x, residual, scale, bias, epsilon, groups, data_layout, activation +): + x = x + residual + out = naive_group_norm(x, scale, bias, epsilon, groups, data_layout) + if activation == "silu": + out[0] = F.silu(paddle.to_tensor(out[0])).numpy() + return out + + +def add_group_norm_silu_static_wrapper( + x, residual, scale, bias, epsilon, groups, data_layout="NHWC", activation="" +): + helper = LayerHelper('add_group_norm_silu', **locals()) + mean_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + + inputs = {'x': x} + if bias is not None: + inputs['bias'] = bias + if scale is not None: + inputs['scale'] = scale + if residual is not None: + inputs['residual'] = residual + + # create output + group_norm_out = helper.create_variable_for_type_inference(dtype=x.dtype) + residual_out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="add_group_norm_silu", + inputs=inputs, + outputs={ + "y": group_norm_out, + "residual_out": residual_out, + "mean": mean_out, + "variance": variance_out, + }, + attrs={ + "epsilon": epsilon, + "groups": groups, + "data_format": data_layout, + "activation": activation, + }, + ) + + return group_norm_out, residual_out + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class 
TestGroupNormNHWC_StaticOp(unittest.TestCase): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (1, 1, 1, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = '' + self.place = paddle.CUDAPlace(0) + + def check_residual_add_groupnorm( + self, x_np, scale_np, bias_np, residual_np, activation, dtype + ): + paddle.disable_static() + navie_groupnorm_out = naive_residual_biasadd_layer_norm( + x_np, + residual_np, + scale_np, + bias_np, + self.epsilon, + self.groups, + self.data_layout, + self.activation, + ) + navie_residual_out = naive_residual_add(x_np, residual_np) + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + x_static = paddle.static.data( + name="x_static", shape=self.shape, dtype=dtype + ) + residual_static = paddle.static.data( + name="residual_static", + shape=self.r_shape, + dtype=dtype, + ) + + scale_static = paddle.static.data( + name="scale_static", shape=[self.shape[-1]], dtype=dtype + ) + bias_static = paddle.static.data( + name="bias_static", shape=[self.shape[-1]], dtype=dtype + ) + outs = add_group_norm_silu_static_wrapper( + x_static, + residual_static, + scale_static, + bias_static, + self.epsilon, + self.groups, + self.data_layout, + activation, + ) + + exe = base.Executor(self.place) + out_s = exe.run( + feed={ + "x_static": x_np.astype(dtype), + "scale_static": scale_np.astype(dtype), + "residual_static": residual_np.astype(dtype), + "bias_static": bias_np.astype(dtype), + }, + fetch_list=[outs], + ) + return (out_s[0], out_s[1]), navie_groupnorm_out, navie_residual_out + + def test_residual_add_groupnorm_fp16(self): + if not paddle.is_compiled_with_cuda(): + return + self.dtype = np.float16 + ( + paddle_group_list, + paddle_naive_group_out, + paddle_naive_group_residual, + ) = self.check_residual_add_groupnorm( + self.x_np.astype(self.dtype), + self.scale_np.astype(self.dtype), + self.bias_np.astype(self.dtype), + self.residual_np.astype(self.dtype), + self.activation, + self.dtype, + ) + np.testing.assert_allclose( + paddle_group_list[1], + paddle_naive_group_residual, + rtol=1e-5, + atol=1e-5, + ) + np.testing.assert_allclose( + paddle_group_list[0], + paddle_naive_group_out[0], + rtol=1e-4, + atol=1e-4, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWCSilu_StaticOp(TestGroupNormNHWC_StaticOp): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (1, 1, 1, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = 'silu' + self.place = paddle.CUDAPlace(0) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWC_StaticOp_1(TestGroupNormNHWC_StaticOp): + def setUp(self): + 
np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (2, 4, 2, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = 'silu' + self.place = paddle.CUDAPlace(0) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWCSilu_StaticOp_1(TestGroupNormNHWC_StaticOp): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (2, 4, 2, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = 'silu' + self.place = paddle.CUDAPlace(0) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWCSingleC_StaticOp(TestGroupNormNHWC_StaticOp): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (2, 4, 2, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 6 + self.data_layout = 'NHWC' + self.activation = '' + self.place = paddle.CUDAPlace(0) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/deprecated/legacy_test/test_gammaln_op.py b/test/legacy_test/test_gammaln_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gammaln_op.py rename to test/legacy_test/test_gammaln_op.py diff --git a/test/deprecated/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gaussian_random_op.py rename to test/legacy_test/test_gaussian_random_op.py diff --git a/test/deprecated/legacy_test/test_graph_send_recv_op.py b/test/legacy_test/test_graph_send_recv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_graph_send_recv_op.py rename to test/legacy_test/test_graph_send_recv_op.py diff --git a/test/deprecated/legacy_test/test_graph_send_ue_recv_op.py b/test/legacy_test/test_graph_send_ue_recv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_graph_send_ue_recv_op.py rename to test/legacy_test/test_graph_send_ue_recv_op.py diff --git a/test/deprecated/legacy_test/test_graph_send_uv_op.py b/test/legacy_test/test_graph_send_uv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_graph_send_uv_op.py rename to test/legacy_test/test_graph_send_uv_op.py diff --git a/test/deprecated/legacy_test/test_grid_sampler_op.py b/test/legacy_test/test_grid_sampler_op.py similarity index 100% rename from test/deprecated/legacy_test/test_grid_sampler_op.py rename to test/legacy_test/test_grid_sampler_op.py diff --git a/test/legacy_test/test_group_norm_op.py 
b/test/legacy_test/test_group_norm_op.py index f097df3b0b99c..7a6f57cc61ece 100644 --- a/test/legacy_test/test_group_norm_op.py +++ b/test/legacy_test/test_group_norm_op.py @@ -209,7 +209,7 @@ def do_compare_between_place(self): gpu_grads, inputs_to_check, 0.005, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", ) def test_check_grad(self): @@ -1748,7 +1748,7 @@ def test_jit_comp(self): fwd_actual[i], rtol=rtol, atol=atol, - err_msg='%s jit fwd' % self.places[i], + err_msg=f'{self.places[i]} jit fwd', ) # TODO: fix the diff between cpu and gpu grad is large in original op @@ -1762,7 +1762,7 @@ def test_jit_comp(self): rev_actual[i], rtol=rtol, atol=atol, - err_msg='%s jit rev' % self.places[i], + err_msg=f'{self.places[i]} jit rev', ) def test_jit_comp_with_cinn(self): @@ -1820,7 +1820,7 @@ def test_jit_comp_with_cinn(self): fwd_actual[i], rtol=rtol, # mean of uniform distribution, scale for avoid random failed atol=atol, - err_msg='%s jit_cinn fwd' % self.places[i], + err_msg=f'{self.places[i]} jit_cinn fwd', ) # TODO: fix the diff between cpu and gpu grad is large in original op # now use larger threshold when testing cpu grads to bypass cpu grad test @@ -1832,7 +1832,7 @@ def test_jit_comp_with_cinn(self): rev_actual[i], rtol=rtol, # mean of uniform distribution, scale for avoid random failed atol=atol, - err_msg='%s jit_cinn rev' % self.places[i], + err_msg=f'{self.places[i]} jit_cinn rev', ) i += 1 diff --git a/test/deprecated/legacy_test/test_gru_op.py b/test/legacy_test/test_gru_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gru_op.py rename to test/legacy_test/test_gru_op.py diff --git a/test/deprecated/legacy_test/test_gru_unit_op.py b/test/legacy_test/test_gru_unit_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gru_unit_op.py rename to test/legacy_test/test_gru_unit_op.py diff --git a/test/deprecated/legacy_test/test_gumbel_softmax_op.py b/test/legacy_test/test_gumbel_softmax_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gumbel_softmax_op.py rename to test/legacy_test/test_gumbel_softmax_op.py diff --git a/test/deprecated/legacy_test/test_hinge_loss_op.py b/test/legacy_test/test_hinge_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_hinge_loss_op.py rename to test/legacy_test/test_hinge_loss_op.py diff --git a/test/deprecated/legacy_test/test_huber_loss_op.py b/test/legacy_test/test_huber_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_huber_loss_op.py rename to test/legacy_test/test_huber_loss_op.py diff --git a/test/deprecated/legacy_test/test_identity_loss_op.py b/test/legacy_test/test_identity_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_identity_loss_op.py rename to test/legacy_test/test_identity_loss_op.py diff --git a/test/deprecated/legacy_test/test_im2sequence_op.py b/test/legacy_test/test_im2sequence_op.py similarity index 100% rename from test/deprecated/legacy_test/test_im2sequence_op.py rename to test/legacy_test/test_im2sequence_op.py diff --git a/test/legacy_test/test_imperative_deepcf.py b/test/legacy_test/test_imperative_deepcf.py index 301ec4e0a468e..31e94078c7ca8 100644 --- a/test/legacy_test/test_imperative_deepcf.py +++ b/test/legacy_test/test_imperative_deepcf.py @@ -188,7 +188,7 @@ def get_data(self): ) def load_data(self): - sys.stderr.write('loading from %s\n' % self.data_path) + sys.stderr.write(f'loading from {self.data_path}\n') likes = {} num_users = -1 
num_items = -1 @@ -299,7 +299,7 @@ def test_deefcf(self): }, fetch_list=[loss], )[0] - sys.stderr.write('static loss %s\n' % static_loss) + sys.stderr.write(f'static loss {static_loss}\n') with base.dygraph.guard(): paddle.seed(seed) diff --git a/test/deprecated/legacy_test/test_imperative_framework.py b/test/legacy_test/test_imperative_framework.py similarity index 77% rename from test/deprecated/legacy_test/test_imperative_framework.py rename to test/legacy_test/test_imperative_framework.py index 01f6d37eed4b1..b85eeb11df517 100644 --- a/test/deprecated/legacy_test/test_imperative_framework.py +++ b/test/legacy_test/test_imperative_framework.py @@ -15,7 +15,6 @@ import unittest import numpy as np -from test_imperative_base import new_program_scope import paddle from paddle import base @@ -53,21 +52,13 @@ def forward(self, inputs): class TestDygraphFramework(unittest.TestCase): - def test_dygraph_backward(self): - with new_program_scope(): - mlp = MLP(input_size=2) - var_inp = paddle.static.data("input", shape=[2, 2], dtype="float32") - out = mlp(var_inp) - try: - out.backward() - raise AssertionError( - "backward should not be usable in static graph mode" - ) - except AssertionError as e: - self.assertTrue(e is not None) - def test_dygraph_to_string(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with base.dygraph.guard(): var_inp = paddle.to_tensor(np_inp) print(str(var_inp)) + + +if __name__ == '__main__': + paddle.disable_static() + unittest.main() diff --git a/test/deprecated/legacy_test/test_imperative_star_gan_with_gradient_penalty.py b/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py similarity index 100% rename from test/deprecated/legacy_test/test_imperative_star_gan_with_gradient_penalty.py rename to test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py diff --git a/test/deprecated/legacy_test/test_index_add_op.py b/test/legacy_test/test_index_add_op.py similarity index 100% rename from test/deprecated/legacy_test/test_index_add_op.py rename to test/legacy_test/test_index_add_op.py diff --git a/test/deprecated/legacy_test/test_index_sample_op.py b/test/legacy_test/test_index_sample_op.py similarity index 100% rename from test/deprecated/legacy_test/test_index_sample_op.py rename to test/legacy_test/test_index_sample_op.py diff --git a/test/deprecated/legacy_test/test_index_select_op.py b/test/legacy_test/test_index_select_op.py similarity index 100% rename from test/deprecated/legacy_test/test_index_select_op.py rename to test/legacy_test/test_index_select_op.py diff --git a/test/deprecated/legacy_test/test_input_spec.py b/test/legacy_test/test_input_spec.py similarity index 96% rename from test/deprecated/legacy_test/test_input_spec.py rename to test/legacy_test/test_input_spec.py index 8f86d002da306..aa649b58ca2a8 100644 --- a/test/deprecated/legacy_test/test_input_spec.py +++ b/test/legacy_test/test_input_spec.py @@ -35,9 +35,17 @@ def test_default(self): self.assertIsNone(tensor_spec.name) def test_from_tensor(self): - x_bool = paddle.tensor.fill_constant( - shape=[1], dtype='bool', value=True - ) + if paddle.framework.use_pir_api(): + x_bool = paddle.pir.core.create_parameter( + dtype='float32', + shape=[1], + name='xx', + initializer=paddle.nn.initializer.Uniform(), + ) + else: + x_bool = paddle.tensor.fill_constant( + shape=[1], dtype='bool', value=True + ) bool_spec = InputSpec.from_tensor(x_bool) self.assertEqual(bool_spec.dtype, x_bool.dtype) self.assertEqual(list(bool_spec.shape), list(x_bool.shape)) diff --git 
a/test/deprecated/legacy_test/test_instance_norm_op_v2.py b/test/legacy_test/test_instance_norm_op_v2.py similarity index 100% rename from test/deprecated/legacy_test/test_instance_norm_op_v2.py rename to test/legacy_test/test_instance_norm_op_v2.py diff --git a/test/deprecated/legacy_test/test_is_integer.py b/test/legacy_test/test_is_integer.py similarity index 100% rename from test/deprecated/legacy_test/test_is_integer.py rename to test/legacy_test/test_is_integer.py diff --git a/test/legacy_test/test_isin.py b/test/legacy_test/test_isin.py new file mode 100644 index 0000000000000..101d89b4de84f --- /dev/null +++ b/test/legacy_test/test_isin.py @@ -0,0 +1,327 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import convert_float_to_uint16 + +import paddle +from paddle import base +from paddle.base import core +from paddle.pir_utils import test_with_pir_api + +DATA_CASES = [ + {'x_data': np.array(1.0), 'test_x_data': np.array(-1.0)}, + { + 'x_data': np.random.randint(-10, 10, (4, 8)), + 'test_x_data': np.random.randint(0, 20, (2, 3)), + }, + { + 'x_data': np.random.randint(-50, 50, (8, 64)), + 'test_x_data': np.random.randint(-20, 0, (4, 256)), + }, +] + +DATA_CASES_UNIQUE = [ + { + 'x_data': np.arange(0, 1000).reshape([2, 5, 100]), + 'test_x_data': np.arange(200, 700), + }, + { + 'x_data': np.arange(-100, 100).reshape([2, 2, 5, 10]), + 'test_x_data': np.arange(50, 150).reshape([4, 5, 5]), + }, +] + +DATA_CASES_BF16 = [ + {'x_data': np.array(1.0), 'test_x_data': np.array(0.0)}, + { + 'x_data': np.random.randint(0, 10, (4, 8)), + 'test_x_data': np.random.randint(5, 15, (2, 3)), + }, + { + 'x_data': np.random.randint(0, 50, (8, 64)), + 'test_x_data': np.random.randint(0, 20, (4, 256)), + }, +] + + +DATA_CASES_UNIQUE_BF16 = [ + { + 'x_data': np.arange(0, 100).reshape([2, 5, 10]), + 'test_x_data': np.arange(50, 150), + }, +] + + +DATA_TYPE = ['float32', 'float64', 'int32', 'int64'] + + +def run_dygraph( + x_data, + test_x_data, + type, + assume_unique=False, + invert=False, + use_gpu=False, +): + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + x_data = x_data.astype(type) + test_x_data = test_x_data.astype(type) + x_e = paddle.to_tensor(x_data) + x_t = paddle.to_tensor(test_x_data) + return paddle.isin(x_e, x_t, assume_unique, invert) + + +def run_static( + x_data, + test_x_data, + type, + assume_unique=False, + invert=False, + use_gpu=False, +): + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = base.Executor(place) + with paddle.static.program_guard(main_program, startup_program): + x_data = x_data.astype(type) + test_x_data = test_x_data.astype(type) + x_e = paddle.static.data(name='x_e', shape=x_data.shape, 
dtype=type) + x_t = paddle.static.data( + name='x_t', shape=test_x_data.shape, dtype=type + ) + res = paddle.isin(x_e, x_t, assume_unique, invert) + static_result = exe.run( + feed={'x_e': x_data, 'x_t': test_x_data}, + fetch_list=[res], + ) + return static_result + + +def test( + data_cases, type_cases, assume_unique=False, invert=False, use_gpu=False +): + for type in type_cases: + for case in data_cases: + x_data = case['x_data'] + test_x_data = case['test_x_data'] + dygraph_result = run_dygraph( + x_data, + test_x_data, + type, + assume_unique, + invert, + use_gpu, + ).numpy() + np_result = np.isin( + x_data.astype(type), + test_x_data.astype(type), + assume_unique=assume_unique, + invert=invert, + ) + np.testing.assert_equal(dygraph_result, np_result) + + @test_with_pir_api + def test_static(): + (static_result,) = run_static( + x_data, + test_x_data, + type, + assume_unique, + invert, + use_gpu, + ) + np.testing.assert_equal(static_result, np_result) + + test_static() + + +def run_dygraph_bf16( + x_data, + test_x_data, + assume_unique=False, + invert=False, + use_gpu=False, +): + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + x_e = paddle.to_tensor(convert_float_to_uint16(x_data)) + x_t = paddle.to_tensor(convert_float_to_uint16(test_x_data)) + return paddle.isin(x_e, x_t, assume_unique, invert) + + +def run_static_bf16( + x_data, + test_x_data, + assume_unique=False, + invert=False, + use_gpu=False, +): + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = base.Executor(place) + with paddle.static.program_guard(main_program, startup_program): + x_data = convert_float_to_uint16(x_data) + test_x_data = convert_float_to_uint16(test_x_data) + x_e = paddle.static.data( + name='x_e', shape=x_data.shape, dtype=np.uint16 + ) + x_t = paddle.static.data( + name='x_t', shape=test_x_data.shape, dtype=np.uint16 + ) + res = paddle.isin(x_e, x_t, assume_unique, invert) + static_result = exe.run( + feed={'x_e': x_data, 'x_t': test_x_data}, + fetch_list=[res], + ) + return static_result + + +def test_bf16(data_cases, assume_unique=False, invert=False, use_gpu=False): + for case in data_cases: + x_data = case['x_data'].astype("float32") + test_x_data = case['test_x_data'].astype("float32") + dygraph_result = run_dygraph_bf16( + x_data, + test_x_data, + assume_unique, + invert, + use_gpu, + ).numpy() + np_result = np.isin( + x_data, + test_x_data, + assume_unique=assume_unique, + invert=invert, + ) + np.testing.assert_equal(dygraph_result, np_result) + + @test_with_pir_api + def test_static(): + (static_result,) = run_static_bf16( + x_data, + test_x_data, + assume_unique, + invert, + use_gpu, + ) + np.testing.assert_equal(static_result, np_result) + + test_static() + + +class TestIsInError(unittest.TestCase): + def test_for_exception(self): + with self.assertRaises(TypeError): + paddle.isin(np.array([1, 2]), np.array([1, 2])) + + +class TestIsIn(unittest.TestCase): + def test_without_gpu(self): + test(DATA_CASES, DATA_TYPE) + + def test_with_gpu(self): + test(DATA_CASES, DATA_TYPE, use_gpu=True) + + def test_invert_without_gpu(self): + test(DATA_CASES, DATA_TYPE, invert=True) + + def test_invert_with_gpu(self): + test(DATA_CASES, DATA_TYPE, invert=True, use_gpu=True) + + def test_unique_without_gpu(self): + test(DATA_CASES_UNIQUE, 
DATA_TYPE, assume_unique=True) + + def test_unique_with_gpu(self): + test(DATA_CASES_UNIQUE, DATA_TYPE, assume_unique=True, use_gpu=True) + + def test_unique_invert_without_gpu(self): + test(DATA_CASES_UNIQUE, DATA_TYPE, assume_unique=True, invert=True) + + def test_unique_invert_with_gpu(self): + test( + DATA_CASES_UNIQUE, + DATA_TYPE, + assume_unique=True, + invert=True, + use_gpu=True, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or does not support float16", +) +class TestIsInFP16(unittest.TestCase): + def test_default(self): + test(DATA_CASES, ['float16'], use_gpu=True) + + def test_invert(self): + test(DATA_CASES, ['float16'], invert=True, use_gpu=True) + + def test_unique(self): + test(DATA_CASES_UNIQUE, ['float16'], assume_unique=True, use_gpu=True) + + def test_unique_invert(self): + test( + DATA_CASES_UNIQUE, + ['float16'], + assume_unique=True, + invert=True, + use_gpu=True, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or does not support float16", +) +class TestIsInBF16(unittest.TestCase): + def test_default(self): + test_bf16(DATA_CASES_BF16, use_gpu=True) + + def test_invert(self): + test_bf16(DATA_CASES_BF16, invert=True, use_gpu=True) + + def test_unique(self): + test_bf16(DATA_CASES_UNIQUE_BF16, assume_unique=True, use_gpu=True) + + def test_unique_invert(self): + test_bf16( + DATA_CASES_UNIQUE_BF16, + assume_unique=True, + invert=True, + use_gpu=True, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_jit_save_load.py b/test/legacy_test/test_jit_save_load.py index 09f5a7b9a4e4b..04b86c6864685 100644 --- a/test/legacy_test/test_jit_save_load.py +++ b/test/legacy_test/test_jit_save_load.py @@ -329,7 +329,6 @@ def train(layer, input_size=784, label_size=1): for data in train_loader(): img, label = data label.stop_gradient = True - cost = layer(img) loss = paddle.nn.functional.cross_entropy( @@ -396,6 +395,8 @@ def train_and_save_model(self, model_path=None): @test_with_dygraph_pir def test_save_load(self): # train and save model + if not paddle.framework.use_pir_api(): + return train_layer = self.train_and_save_model() # load model loaded_layer = paddle.jit.load(self.model_path) @@ -496,6 +497,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() + @test_with_dygraph_pir def test_output_same_order(self): x = paddle.to_tensor(np.random.random((4, 8)).astype('float32')) @@ -1712,6 +1714,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() + @test_with_dygraph_pir def test_save_load_finetune_load(self): model_path = os.path.join( self.temp_dir.name, "test_jit_save_load_save_without_running/model" ) @@ -1788,7 +1791,6 @@ def forward(self, x): return y -''' class TestJitSaveLoadFinetuneLoad(unittest.TestCase): def setUp(self): # enable dygraph mode @@ -1798,8 +1800,10 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() - #@test_with_dygraph_pir + @test_with_dygraph_pir def test_save_load_finetune_load(self): + if not paddle.framework.use_pir_api(): + return model_path = os.path.join( self.temp_dir.name, "test_jit_save_load_finetune_load/model" ) @@ -1830,7 +1834,6 @@ def test_save_load_finetune_load(self): self.assertTrue(float((result_00 - result_10).abs().max()) < 1e-5) self.assertTrue(float((result_01 - result_11).abs().max()) < 1e-5) -''' # NOTE(weixin): When there are multiple test functions in
an diff --git a/test/deprecated/legacy_test/test_kldiv_loss_op.py b/test/legacy_test/test_kldiv_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_kldiv_loss_op.py rename to test/legacy_test/test_kldiv_loss_op.py diff --git a/test/deprecated/legacy_test/test_kron_op.py b/test/legacy_test/test_kron_op.py similarity index 100% rename from test/deprecated/legacy_test/test_kron_op.py rename to test/legacy_test/test_kron_op.py diff --git a/test/deprecated/legacy_test/test_kthvalue_op.py b/test/legacy_test/test_kthvalue_op.py similarity index 100% rename from test/deprecated/legacy_test/test_kthvalue_op.py rename to test/legacy_test/test_kthvalue_op.py diff --git a/test/deprecated/legacy_test/test_l1_norm_op.py b/test/legacy_test/test_l1_norm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_l1_norm_op.py rename to test/legacy_test/test_l1_norm_op.py diff --git a/test/deprecated/legacy_test/test_label_smooth_op.py b/test/legacy_test/test_label_smooth_op.py similarity index 100% rename from test/deprecated/legacy_test/test_label_smooth_op.py rename to test/legacy_test/test_label_smooth_op.py diff --git a/test/deprecated/legacy_test/test_lerp_op.py b/test/legacy_test/test_lerp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lerp_op.py rename to test/legacy_test/test_lerp_op.py diff --git a/test/deprecated/legacy_test/test_lgamma_op.py b/test/legacy_test/test_lgamma_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lgamma_op.py rename to test/legacy_test/test_lgamma_op.py diff --git a/test/deprecated/legacy_test/test_linear_interp_op.py b/test/legacy_test/test_linear_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_linear_interp_op.py rename to test/legacy_test/test_linear_interp_op.py diff --git a/test/deprecated/legacy_test/test_linear_interp_v2_op.py b/test/legacy_test/test_linear_interp_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_linear_interp_v2_op.py rename to test/legacy_test/test_linear_interp_v2_op.py diff --git a/test/deprecated/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py similarity index 100% rename from test/deprecated/legacy_test/test_load_state_dict_from_old_format.py rename to test/legacy_test/test_load_state_dict_from_old_format.py diff --git a/test/deprecated/legacy_test/test_log_loss_op.py b/test/legacy_test/test_log_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_log_loss_op.py rename to test/legacy_test/test_log_loss_op.py diff --git a/test/deprecated/legacy_test/test_log_softmax.py b/test/legacy_test/test_log_softmax.py similarity index 100% rename from test/deprecated/legacy_test/test_log_softmax.py rename to test/legacy_test/test_log_softmax.py diff --git a/test/deprecated/legacy_test/test_logsumexp.py b/test/legacy_test/test_logsumexp.py similarity index 100% rename from test/deprecated/legacy_test/test_logsumexp.py rename to test/legacy_test/test_logsumexp.py diff --git a/test/deprecated/legacy_test/test_lr_scheduler.py b/test/legacy_test/test_lr_scheduler.py similarity index 100% rename from test/deprecated/legacy_test/test_lr_scheduler.py rename to test/legacy_test/test_lr_scheduler.py diff --git a/test/deprecated/legacy_test/test_lrn_op.py b/test/legacy_test/test_lrn_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lrn_op.py rename to test/legacy_test/test_lrn_op.py diff --git 
a/test/legacy_test/test_lstm_cudnn_op.py b/test/legacy_test/test_lstm_cudnn_op.py index ade1f61c0d5a9..3362297747b63 100644 --- a/test/legacy_test/test_lstm_cudnn_op.py +++ b/test/legacy_test/test_lstm_cudnn_op.py @@ -35,7 +35,7 @@ class RandomWeight: def __init__(self): pass - def updata_weight(self, hidden_size, input_size, dtype): + def update_weight(self, hidden_size, input_size, dtype): std = 1.0 / math.sqrt(hidden_size) self.hidden_size = hidden_size self.input_size = input_size @@ -432,7 +432,7 @@ def setUp(self): input[9][3:][:] = 0 input[8][4:][:] = 0 - weight.updata_weight(hidden_size, input_size, self.dtype) + weight.update_weight(hidden_size, input_size, self.dtype) rnn1 = LSTM( input_size, hidden_size, diff --git a/test/deprecated/legacy_test/test_lstm_op.py b/test/legacy_test/test_lstm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lstm_op.py rename to test/legacy_test/test_lstm_op.py diff --git a/test/deprecated/legacy_test/test_lu_unpack_op.py b/test/legacy_test/test_lu_unpack_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lu_unpack_op.py rename to test/legacy_test/test_lu_unpack_op.py diff --git a/test/deprecated/legacy_test/test_masked_scatter.py b/test/legacy_test/test_masked_scatter.py similarity index 100% rename from test/deprecated/legacy_test/test_masked_scatter.py rename to test/legacy_test/test_masked_scatter.py diff --git a/test/deprecated/legacy_test/test_matmul_op.py b/test/legacy_test/test_matmul_op.py similarity index 100% rename from test/deprecated/legacy_test/test_matmul_op.py rename to test/legacy_test/test_matmul_op.py diff --git a/test/deprecated/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_matmul_v2_op.py rename to test/legacy_test/test_matmul_v2_op.py diff --git a/test/deprecated/legacy_test/test_maxout_op.py b/test/legacy_test/test_maxout_op.py similarity index 100% rename from test/deprecated/legacy_test/test_maxout_op.py rename to test/legacy_test/test_maxout_op.py diff --git a/test/deprecated/legacy_test/test_meshgrid_op.py b/test/legacy_test/test_meshgrid_op.py similarity index 80% rename from test/deprecated/legacy_test/test_meshgrid_op.py rename to test/legacy_test/test_meshgrid_op.py index b72f51cd04144..869e2c4e88281 100644 --- a/test/deprecated/legacy_test/test_meshgrid_op.py +++ b/test/legacy_test/test_meshgrid_op.py @@ -42,16 +42,28 @@ def init_data_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) + if self.dtype == np.complex64 or self.dtype == np.complex128: + self.check_output(check_pir=True) + else: + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) def test_check_grad(self): - self.check_grad( - ['x0'], - ['out0', 'out1'], - check_prim=True, - check_pir=True, - check_prim_pir=True, - ) + if self.dtype == np.complex64 or self.dtype == np.complex128: + self.check_grad( + ['x0'], + ['out0', 'out1'], + check_pir=True, + ) + else: + self.check_grad( + ['x0'], + ['out0', 'out1'], + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def init_inputs_and_outputs(self): self.shape = self.get_x_shape() @@ -91,6 +103,22 @@ def init_data_type(self): self.dtype = np.float16 +class TestMeshgridOp2Complex64(TestMeshgridOp): + def get_x_shape(self): + return [100, 300] + + def init_data_type(self): + self.dtype = np.complex64 + + +class TestMeshgridOp2Complex128(TestMeshgridOp): 
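+    # Editor's note (clarifying comment, not part of the original patch):
+    # complex128 twin of TestMeshgridOp2Complex64 above; both complex dtypes
+    # take the non-prim check_output/check_grad branch added in this hunk,
+    # presumably because prim decomposition does not cover complex types.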
+ def get_x_shape(self): + return [100, 300] + + def init_data_type(self): + self.dtype = np.complex128 + + @unittest.skipIf( not core.is_compiled_with_cuda() or not core.is_bfloat16_supported(core.CUDAPlace(0)), @@ -336,6 +364,70 @@ def test_api_with_dygraph_tuple_input(self): np.testing.assert_array_equal(res_4.shape, [100, 200]) +class TestMeshgridOpComplexStatic(unittest.TestCase): + @test_with_pir_api + def test_tuple_input(self): + input_1 = np.random.randint( + 0, + 100, + [ + 100, + ], + ).astype('complex64') + input_2 = np.random.randint( + 0, + 100, + [ + 200, + ], + ).astype('complex64') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(shape=[100], dtype='complex64', name='x') + y = paddle.static.data(shape=[200], dtype='complex64', name='y') + + exe = base.Executor(place=base.CPUPlace()) + grid_x, grid_y = paddle.tensor.meshgrid((x, y)) + res_1, res_2 = exe.run( + paddle.static.default_main_program(), + feed={'x': input_1, 'y': input_2}, + fetch_list=[grid_x, grid_y], + ) + np.testing.assert_array_equal(res_1, out_1) + np.testing.assert_array_equal(res_2, out_2) + + +class TestMeshgridOpComplexDygraph(unittest.TestCase): + def test_api_with_dygraph_tuple_input(self): + input_3 = np.random.randint( + 0, + 100, + [ + 100, + ], + ).astype('complex64') + input_4 = np.random.randint( + 0, + 100, + [ + 200, + ], + ).astype('complex64') + + with base.dygraph.guard(): + tensor_3 = paddle.to_tensor(input_3) + tensor_4 = paddle.to_tensor(input_4) + res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4)) + + np.testing.assert_array_equal(res_3.shape, [100, 200]) + np.testing.assert_array_equal(res_4.shape, [100, 200]) + + class TestMeshGrid_ZeroDim(TestMeshgridOp): def init_inputs_and_outputs(self): self.shape = self.get_x_shape() diff --git a/test/deprecated/legacy_test/test_modified_huber_loss_op.py b/test/legacy_test/test_modified_huber_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_modified_huber_loss_op.py rename to test/legacy_test/test_modified_huber_loss_op.py diff --git a/test/deprecated/legacy_test/test_mul_op.py b/test/legacy_test/test_mul_op.py similarity index 100% rename from test/deprecated/legacy_test/test_mul_op.py rename to test/legacy_test/test_mul_op.py diff --git a/test/deprecated/legacy_test/test_multi_dot_op.py b/test/legacy_test/test_multi_dot_op.py similarity index 100% rename from test/deprecated/legacy_test/test_multi_dot_op.py rename to test/legacy_test/test_multi_dot_op.py diff --git a/test/deprecated/legacy_test/test_mv_op.py b/test/legacy_test/test_mv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_mv_op.py rename to test/legacy_test/test_mv_op.py diff --git a/test/deprecated/legacy_test/test_nearest_interp_op.py b/test/legacy_test/test_nearest_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_nearest_interp_op.py rename to test/legacy_test/test_nearest_interp_op.py diff --git a/test/deprecated/legacy_test/test_nearest_interp_v2_op.py b/test/legacy_test/test_nearest_interp_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_nearest_interp_v2_op.py rename to test/legacy_test/test_nearest_interp_v2_op.py diff --git a/test/deprecated/legacy_test/test_ops_nms.py b/test/legacy_test/test_ops_nms.py similarity index 100% rename from 
test/deprecated/legacy_test/test_ops_nms.py rename to test/legacy_test/test_ops_nms.py diff --git a/test/deprecated/legacy_test/test_overlap_add_op.py b/test/legacy_test/test_overlap_add_op.py similarity index 100% rename from test/deprecated/legacy_test/test_overlap_add_op.py rename to test/legacy_test/test_overlap_add_op.py diff --git a/test/deprecated/legacy_test/test_pad3d_op.py b/test/legacy_test/test_pad3d_op.py similarity index 100% rename from test/deprecated/legacy_test/test_pad3d_op.py rename to test/legacy_test/test_pad3d_op.py diff --git a/test/deprecated/legacy_test/test_paddle_save_load_binary.py b/test/legacy_test/test_paddle_save_load_binary.py similarity index 100% rename from test/deprecated/legacy_test/test_paddle_save_load_binary.py rename to test/legacy_test/test_paddle_save_load_binary.py diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel.py b/test/legacy_test/test_parallel_dygraph_dataparallel.py index 648f6ddd97ef2..166687ce098e4 100644 --- a/test/legacy_test/test_parallel_dygraph_dataparallel.py +++ b/test/legacy_test/test_parallel_dygraph_dataparallel.py @@ -66,7 +66,7 @@ def start_local_trainers_cpu( proc_env = { "PADDLE_DISTRI_BACKEND": "gloo", "PADDLE_TRAINER_ID": "%d" % rank_id, - "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{endpoint}", "PADDLE_TRAINERS_NUM": "%d" % n_rank, "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), } @@ -118,10 +118,11 @@ def start_local_trainers( procs = [] for t in pod.trainers: proc_env = { - f"FLAGS_selected_{accelerator_type}s": "%s" - % ",".join([str(g) for g in t.gpus]), + f"FLAGS_selected_{accelerator_type}s": "{}".format( + ",".join([str(g) for g in t.gpus]) + ), "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "FLAGS_dynamic_static_unified_comm": "0", diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py b/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py index 5a944284414bf..cd1b89e064d6e 100644 --- a/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py +++ b/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py @@ -66,7 +66,7 @@ def start_local_trainers( for t in pod.trainers: proc_env = { "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "MASTER_ADDR": "127.0.0.1", diff --git a/test/deprecated/legacy_test/test_partial_concat_op.py b/test/legacy_test/test_partial_concat_op.py similarity index 100% rename from test/deprecated/legacy_test/test_partial_concat_op.py rename to test/legacy_test/test_partial_concat_op.py diff --git a/test/deprecated/legacy_test/test_partial_sum_op.py b/test/legacy_test/test_partial_sum_op.py similarity index 100% rename from test/deprecated/legacy_test/test_partial_sum_op.py rename to test/legacy_test/test_partial_sum_op.py diff --git a/test/deprecated/legacy_test/test_pixel_shuffle_op.py b/test/legacy_test/test_pixel_shuffle_op.py similarity index 100% rename from test/deprecated/legacy_test/test_pixel_shuffle_op.py rename to test/legacy_test/test_pixel_shuffle_op.py diff --git a/test/deprecated/legacy_test/test_pool3d_op.py b/test/legacy_test/test_pool3d_op.py similarity index 100% rename 
from test/deprecated/legacy_test/test_pool3d_op.py rename to test/legacy_test/test_pool3d_op.py diff --git a/test/deprecated/legacy_test/test_put_along_axis_op.py b/test/legacy_test/test_put_along_axis_op.py similarity index 100% rename from test/deprecated/legacy_test/test_put_along_axis_op.py rename to test/legacy_test/test_put_along_axis_op.py diff --git a/test/deprecated/legacy_test/test_qr_op.py b/test/legacy_test/test_qr_op.py similarity index 100% rename from test/deprecated/legacy_test/test_qr_op.py rename to test/legacy_test/test_qr_op.py diff --git a/test/deprecated/legacy_test/test_repeat_interleave_op.py b/test/legacy_test/test_repeat_interleave_op.py similarity index 100% rename from test/deprecated/legacy_test/test_repeat_interleave_op.py rename to test/legacy_test/test_repeat_interleave_op.py diff --git a/test/deprecated/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py similarity index 100% rename from test/deprecated/legacy_test/test_reshape_op.py rename to test/legacy_test/test_reshape_op.py diff --git a/test/deprecated/legacy_test/test_reverse_op.py b/test/legacy_test/test_reverse_op.py similarity index 100% rename from test/deprecated/legacy_test/test_reverse_op.py rename to test/legacy_test/test_reverse_op.py diff --git a/test/deprecated/legacy_test/test_roi_align_op.py b/test/legacy_test/test_roi_align_op.py similarity index 100% rename from test/deprecated/legacy_test/test_roi_align_op.py rename to test/legacy_test/test_roi_align_op.py diff --git a/test/deprecated/legacy_test/test_roi_pool_op.py b/test/legacy_test/test_roi_pool_op.py similarity index 100% rename from test/deprecated/legacy_test/test_roi_pool_op.py rename to test/legacy_test/test_roi_pool_op.py diff --git a/test/deprecated/legacy_test/test_roll_op.py b/test/legacy_test/test_roll_op.py similarity index 100% rename from test/deprecated/legacy_test/test_roll_op.py rename to test/legacy_test/test_roll_op.py diff --git a/test/deprecated/legacy_test/test_row_conv_op.py b/test/legacy_test/test_row_conv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_row_conv_op.py rename to test/legacy_test/test_row_conv_op.py diff --git a/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py b/test/legacy_test/test_save_inference_model_conditional_op.py similarity index 100% rename from test/deprecated/legacy_test/test_save_inference_model_conditional_op.py rename to test/legacy_test/test_save_inference_model_conditional_op.py diff --git a/test/deprecated/legacy_test/test_save_model_without_var.py b/test/legacy_test/test_save_model_without_var.py similarity index 100% rename from test/deprecated/legacy_test/test_save_model_without_var.py rename to test/legacy_test/test_save_model_without_var.py diff --git a/test/deprecated/legacy_test/test_scatter_op.py b/test/legacy_test/test_scatter_op.py similarity index 100% rename from test/deprecated/legacy_test/test_scatter_op.py rename to test/legacy_test/test_scatter_op.py diff --git a/test/deprecated/legacy_test/test_selu_op.py b/test/legacy_test/test_selu_op.py similarity index 100% rename from test/deprecated/legacy_test/test_selu_op.py rename to test/legacy_test/test_selu_op.py diff --git a/test/deprecated/legacy_test/test_shuffle_channel_op.py b/test/legacy_test/test_shuffle_channel_op.py similarity index 100% rename from test/deprecated/legacy_test/test_shuffle_channel_op.py rename to test/legacy_test/test_shuffle_channel_op.py diff --git 
a/test/deprecated/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py b/test/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py similarity index 100% rename from test/deprecated/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py rename to test/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py diff --git a/test/deprecated/legacy_test/test_sign_op.py b/test/legacy_test/test_sign_op.py similarity index 100% rename from test/deprecated/legacy_test/test_sign_op.py rename to test/legacy_test/test_sign_op.py diff --git a/test/deprecated/legacy_test/test_solve_op.py b/test/legacy_test/test_solve_op.py similarity index 100% rename from test/deprecated/legacy_test/test_solve_op.py rename to test/legacy_test/test_solve_op.py diff --git a/test/deprecated/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_spectral_norm_op.py rename to test/legacy_test/test_spectral_norm_op.py diff --git a/test/deprecated/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py similarity index 100% rename from test/deprecated/legacy_test/test_split_op.py rename to test/legacy_test/test_split_op.py diff --git a/test/deprecated/legacy_test/test_static_save_load_large.py b/test/legacy_test/test_static_save_load_large.py similarity index 100% rename from test/deprecated/legacy_test/test_static_save_load_large.py rename to test/legacy_test/test_static_save_load_large.py diff --git a/test/deprecated/legacy_test/test_stft_op.py b/test/legacy_test/test_stft_op.py similarity index 100% rename from test/deprecated/legacy_test/test_stft_op.py rename to test/legacy_test/test_stft_op.py diff --git a/test/deprecated/legacy_test/test_svd_op.py b/test/legacy_test/test_svd_op.py similarity index 100% rename from test/deprecated/legacy_test/test_svd_op.py rename to test/legacy_test/test_svd_op.py diff --git a/test/deprecated/legacy_test/test_swiglu.py b/test/legacy_test/test_swiglu.py similarity index 100% rename from test/deprecated/legacy_test/test_swiglu.py rename to test/legacy_test/test_swiglu.py diff --git a/test/deprecated/legacy_test/test_temporal_shift_op.py b/test/legacy_test/test_temporal_shift_op.py similarity index 100% rename from test/deprecated/legacy_test/test_temporal_shift_op.py rename to test/legacy_test/test_temporal_shift_op.py diff --git a/test/deprecated/legacy_test/test_top_k_op.py b/test/legacy_test/test_top_k_op.py similarity index 100% rename from test/deprecated/legacy_test/test_top_k_op.py rename to test/legacy_test/test_top_k_op.py diff --git a/test/deprecated/legacy_test/test_top_k_v2_op.py b/test/legacy_test/test_top_k_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_top_k_v2_op.py rename to test/legacy_test/test_top_k_v2_op.py diff --git a/test/deprecated/legacy_test/test_trace_op.py b/test/legacy_test/test_trace_op.py similarity index 100% rename from test/deprecated/legacy_test/test_trace_op.py rename to test/legacy_test/test_trace_op.py diff --git a/test/deprecated/legacy_test/test_triangular_solve_op.py b/test/legacy_test/test_triangular_solve_op.py similarity index 100% rename from test/deprecated/legacy_test/test_triangular_solve_op.py rename to test/legacy_test/test_triangular_solve_op.py diff --git a/test/deprecated/legacy_test/test_trilinear_interp_op.py b/test/legacy_test/test_trilinear_interp_op.py similarity index 100% rename from 
test/deprecated/legacy_test/test_trilinear_interp_op.py rename to test/legacy_test/test_trilinear_interp_op.py diff --git a/test/deprecated/legacy_test/test_trilinear_interp_v2_op.py b/test/legacy_test/test_trilinear_interp_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_trilinear_interp_v2_op.py rename to test/legacy_test/test_trilinear_interp_v2_op.py diff --git a/test/deprecated/legacy_test/test_trunc_op.py b/test/legacy_test/test_trunc_op.py similarity index 100% rename from test/deprecated/legacy_test/test_trunc_op.py rename to test/legacy_test/test_trunc_op.py diff --git a/test/deprecated/legacy_test/test_unfold_op.py b/test/legacy_test/test_unfold_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unfold_op.py rename to test/legacy_test/test_unfold_op.py diff --git a/test/deprecated/legacy_test/test_unique_consecutive_op.py b/test/legacy_test/test_unique_consecutive_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unique_consecutive_op.py rename to test/legacy_test/test_unique_consecutive_op.py diff --git a/test/deprecated/legacy_test/test_unpool3d_op.py b/test/legacy_test/test_unpool3d_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unpool3d_op.py rename to test/legacy_test/test_unpool3d_op.py diff --git a/test/deprecated/legacy_test/test_unpool_op.py b/test/legacy_test/test_unpool_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unpool_op.py rename to test/legacy_test/test_unpool_op.py diff --git a/test/deprecated/legacy_test/test_unstack_op.py b/test/legacy_test/test_unstack_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unstack_op.py rename to test/legacy_test/test_unstack_op.py diff --git a/test/deprecated/legacy_test/test_yolov3_loss_op.py b/test/legacy_test/test_yolov3_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_yolov3_loss_op.py rename to test/legacy_test/test_yolov3_loss_op.py diff --git a/test/prim/pir_prim/CMakeLists.txt b/test/prim/pir_prim/CMakeLists.txt index 108cc3b8b28da..300cd1948f6b8 100644 --- a/test/prim/pir_prim/CMakeLists.txt +++ b/test/prim/pir_prim/CMakeLists.txt @@ -12,7 +12,8 @@ set(TEST_PRIM_PURE_PIR_CASES test_auto_recompute test_auto_recompute_dy2static test_prim_sub_graph_dynamic_shape - test_decompose_control_flow) + test_decompose_control_flow + test_decomp_whole_program) foreach(target ${TEST_PRIM_PURE_PIR_CASES}) py_test_modules( @@ -52,6 +53,7 @@ if(WITH_CINN) FLAGS_prim_check_ops=true FLAGS_enable_pir_api=true FLAGS_prim_enable_dynamic=true + FLAGS_prim_vjp_skip_default_ops=false FLAGS_cinn_bucket_compile=True FLAGS_pir_apply_shape_optimization_pass=1) set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=CINN") diff --git a/test/prim/pir_prim/test_decomp_whole_program.py b/test/prim/pir_prim/test_decomp_whole_program.py index f8c58ef7c2469..7d0b28edf5dad 100644 --- a/test/prim/pir_prim/test_decomp_whole_program.py +++ b/test/prim/pir_prim/test_decomp_whole_program.py @@ -40,7 +40,8 @@ def base_net(self, flag=None): y.stop_gradient = False x1 = paddle.sin(x) y1 = paddle.cos(y) - tmp1 = paddle.matmul(x1, y1) + y3 = paddle.matmul(x1, y1) + tmp1 = paddle.concat((x1, y1, y3)) tmp2 = paddle.mean(tmp1) sum_out = paddle.sin(tmp2) gradients = grad(sum_out, (x, y)) @@ -54,17 +55,18 @@ def base_net(self, flag=None): whole_ops = [op.name() for op in main_program.global_block().ops] if flag == "prim": - assert 'pd_op.matmul_grad' not in whole_ops + assert 'pd_op.concat_grad' 
not in whole_ops else: - assert 'pd_op.matmul_grad' in whole_ops + assert 'pd_op.concat_grad' in whole_ops return fwd, dx, dy def test_prim_all(self): + paddle.base.core._set_prim_backward_blacklist("sin_grad", "cos_grad") res_ref = self.base_net() res = self.base_net("prim") for ref, actual in zip(res_ref, res): - np.testing.assert_allclose(ref, actual, rtol=1e-6) + np.testing.assert_allclose(ref, actual, rtol=1e-6, atol=1e-6) if __name__ == "__main__": diff --git a/test/quantization/test_imperative_qat_lsq.py b/test/quantization/test_imperative_qat_lsq.py index c71bd02c56bbc..bd16d309b249c 100644 --- a/test/quantization/test_imperative_qat_lsq.py +++ b/test/quantization/test_imperative_qat_lsq.py @@ -213,7 +213,7 @@ def func_qat(self): print('eval_acc_top1', eval_acc_top1) self.assertTrue( eval_acc_top1 > 0.9, - msg="The test acc {%f} is less than 0.9." % eval_acc_top1, + msg=f"The test acc {{{eval_acc_top1:f}}} is less than 0.9.", ) def test_qat(self): diff --git a/test/sot/test_sot_dynamic_shape.py b/test/sot/test_sot_dynamic_shape.py index ceed37d64438a..12608d1c871e4 100644 --- a/test/sot/test_sot_dynamic_shape.py +++ b/test/sot/test_sot_dynamic_shape.py @@ -25,7 +25,7 @@ from paddle.jit.sot.utils import with_allow_dynamic_shape_guard -def foo(x): +def dynamic_shape_input_func1(x): s = x.shape[0] return x + s @@ -85,6 +85,20 @@ def test_dynamic_int_input_cache_hit_case3(self): ) self.assertEqual(ctx.translate_count, i + 1) + def test_dynamic_shape_input_cache_hit_case1(self): + with with_allow_dynamic_shape_guard( + True + ), test_instruction_translator_cache_context() as ctx: + self.assert_results( + dynamic_shape_input_func1, paddle.randn([1, 4, 5]) + ) + self.assertEqual(ctx.translate_count, 1) + for i in range(2, 6): + self.assert_results( + dynamic_shape_input_func1, paddle.randn([i, 4, 5]) + ) + self.assertEqual(ctx.translate_count, 2) + if __name__ == '__main__': unittest.main() diff --git a/test/standalone_executor/test_standalone_measure_real_op_cost.py b/test/standalone_executor/test_standalone_measure_real_op_cost.py index 9825e16e91ee6..8ee254a427d8e 100644 --- a/test/standalone_executor/test_standalone_measure_real_op_cost.py +++ b/test/standalone_executor/test_standalone_measure_real_op_cost.py @@ -112,7 +112,7 @@ def _run_op_profiling(self, place, run_profiling=True): return loss_data def _compare_loss_between(self, loss_run1, loss_run2): - s1, s2 = '%.6f' % loss_run1, '%.6f' % loss_run2 + s1, s2 = f'{loss_run1:.6f}', f'{loss_run2:.6f}' return s1 == s2 def test_op_profiling_cuda0(self): diff --git a/test/xpu/test_block_multihead_attention_op_xpu.py b/test/xpu/test_block_multihead_attention_op_xpu.py new file mode 100644 index 0000000000000..07c624c86b209 --- /dev/null +++ b/test/xpu/test_block_multihead_attention_op_xpu.py @@ -0,0 +1,585 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
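+
+# ---------------------------------------------------------------------------
+# Editor's note (illustrative sketch, not part of the original patch): this
+# new test cross-checks the fused block_multihead_attention_xpu kernel
+# against a naive reference implementation. The causal mask built by
+# create_attn_mask() below relies on (tril(ones) - 1) * 1e4 being 0 on
+# visible positions and -1e4 on future positions, e.g.:
+#
+#     import paddle
+#     m = (paddle.tril(paddle.ones([3, 3])) - 1) * 1e4
+#     # m -> [[0, -1e4, -1e4],
+#     #       [0,    0, -1e4],
+#     #       [0,    0,    0]]
+#
+# Adding m to the attention logits before softmax drives the masked
+# probabilities to ~0, which is the behaviour the fused kernel reproduces.
+# ---------------------------------------------------------------------------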
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional.block_multihead_attention import ( + block_multihead_attention_xpu, +) + +paddle.seed(2023) +np.random.seed(2023) + + +def create_attn_mask( + mask_type, + batch_size, + seq_lens, + pre_cache_length=0, +): + max_seq_len = max(seq_lens) + mask = paddle.zeros( + [batch_size, 1, max_seq_len, max_seq_len + pre_cache_length], + dtype=mask_type, + ) + mask[:, :, :, :pre_cache_length] = 1 + for i in range(batch_size): + seq_len = seq_lens[i] + mask[i, 0, :seq_len, :seq_len] = ( + paddle.tril(paddle.ones(shape=(seq_len, seq_len), dtype=mask_type)) + - 1 + ) * 1e4 + return mask + + +def naive_attention_impl( + query, + key, + value, + cache_k=None, + cache_v=None, + pre_cache_k=None, + pre_cache_v=None, + mask=None, + scale=1.0, + cache_k_dequant_scales=None, + cache_v_dequant_scales=None, + use_cachekv_int8="None", +): + batch = query.shape[0] + heads = query.shape[1] + seq_len = query.shape[2] + head_dim = query.shape[3] + kv_head = key.shape[1] + + key = key.reshape([batch, kv_head, 1, seq_len, head_dim]) + key = paddle.tile(key, [1, 1, heads // kv_head, 1, 1]) + key = key.reshape([batch, heads, seq_len, head_dim]) + + if use_cachekv_int8 == "dynamic": + unsqueeze_shape = [2, 3] + elif use_cachekv_int8 == "static": + unsqueeze_shape = [0, 2, 3] + if pre_cache_k is not None: + key = paddle.concat([pre_cache_k, key], axis=2) + if cache_k is not None: + if cache_k_dequant_scales is not None: + dequant_cache_k = ( + (cache_k.astype('float32') - 128.0) + * cache_k_dequant_scales.unsqueeze(unsqueeze_shape) + ).astype(key.dtype) + key = paddle.concat([dequant_cache_k, key], axis=2) + else: + key = paddle.concat([cache_k, key], axis=2) + + value = value.reshape([batch, kv_head, 1, seq_len, head_dim]) + value = paddle.tile(value, [1, 1, heads // kv_head, 1, 1]) + value = value.reshape([batch, heads, seq_len, head_dim]) + if pre_cache_v is not None: + value = paddle.concat([pre_cache_v, value], axis=2) + if cache_v is not None: + if cache_v_dequant_scales is not None: + dequant_cache_v = ( + (cache_v.astype('float32') - 128.0) + * cache_v_dequant_scales.unsqueeze(unsqueeze_shape) + ).astype(value.dtype) + value = paddle.concat([dequant_cache_v, value], axis=2) + else: + value = paddle.concat([cache_v, value], axis=2) + + qk_res = paddle.matmul(query, key, transpose_y=True) + attention = qk_res * scale + if mask is not None: + attention = attention + mask + softmax_result = paddle.nn.functional.softmax(attention, -1) + result = paddle.matmul(softmax_result, value) + return result + + +def get_padding_offset(bsz, max_seq_len, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(max_seq_len - seq_lens_this_time) + cum_offsets = paddle.zeros(shape=(bsz + 1), dtype="int32") + cum_offsets[1:] = cum_offsets_now + token_num = paddle.sum(seq_lens_this_time) + padding_offsets = paddle.zeros(shape=(token_num), dtype="int32") + cu_seqlens_q = paddle.zeros(shape=(bsz + 1), dtype="int32") + cu_seqlens_k = paddle.zeros(shape=(bsz + 1), dtype="int32") + for i in range(bsz): + seq_len_now = seq_lens_this_time[i] + cum_offset = cum_offsets[i] + for j in range(seq_len_now): + padding_offsets[i * max_seq_len - cum_offset + j] = cum_offset + cum_seq_len = (i + 1) * max_seq_len - cum_offsets[i + 1] + cu_seqlens_q[i + 1] = cum_seq_len + cu_seqlens_k[i + 1] = cum_seq_len + return padding_offsets, cum_offsets[:-1], cu_seqlens_q, cu_seqlens_k + + +class RopeEmbedding: + def _rotary_position_embedding(self, seq_len, head_dim, dtype): + 
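+        # Editor's note (clarifying comment, not part of the original patch):
+        # builds the sinusoid table for the naive RoPE path: [sin | cos]
+        # concatenated along the last axis and returned with shape
+        # (1, 1, seq_len, head_dim), which _apply_rope/_apply_neox_rope
+        # later split back apart via paddle.chunk(rp, 2, axis=-1).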
pos_seq = paddle.arange(0, seq_len, 1, dtype=dtype) + indices = paddle.arange(0, head_dim, 2, dtype=dtype) + indices = 1 / 10000 ** (indices / head_dim) + + sinusoid_inp = pos_seq.unsqueeze(1) * indices.unsqueeze(0) + pos_emb = paddle.concat( + [paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1 + ) + pos_emb = paddle.reshape(pos_emb, (1, 1, seq_len, head_dim)) + pos_emb.stop_gradient = True + return pos_emb + + def _apply_rope(self, rp, q, k, v=None): + # sin [sequence_length, embed_size_per_head//2] + # cos [sequence_length, embed_size_per_head//2] + sin, cos = paddle.chunk(rp, 2, axis=-1) + # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] + sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), rp.shape) + # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] + cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), rp.shape) + # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2] + rotate_half_q = paddle.reshape( + paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1), + paddle.shape(q), + ) + query = paddle.add( + paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos) + ) + # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2] + rotate_half_k = paddle.reshape( + paddle.stack([-k[:, :, :, 1::2], k[:, :, :, 0::2]], axis=-1), + paddle.shape(k), + ) + key = paddle.add( + paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos) + ) + if v is not None: + # rotate_half_value_layer [-v1,v0,-v3,v2......,-vd-1,vd-2] + rotate_half_v = paddle.reshape( + paddle.stack([-v[:, :, :, 1::2], v[:, :, :, 0::2]], axis=-1), + paddle.shape(v), + ) + value = paddle.add( + paddle.multiply(v, cos_pos), + paddle.multiply(rotate_half_v, sin_pos), + ) + return query, key, value + return query, key + + def _apply_neox_rope(self, rp, q, k, v=None): + # sin [bs, sequence_length, embed_size_per_head//2] + # cos [bs, sequence_length, embed_size_per_head//2] + sin, cos = paddle.chunk(rp, 2, axis=-1) + + # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ1,θ2......θd/2-1, θ0,θ1,θ2......θd/2-1] + sin_pos = paddle.concat([sin, sin], axis=-1).squeeze(0).unsqueeze(1) + # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ1,θ2......θd/2-1, θ0,θ1,θ2......θd/2-1] + cos_pos = paddle.concat([cos, cos], axis=-1).squeeze(0).unsqueeze(1) + rotate_half_q = paddle.reshape( + paddle.concat( + [-q[:, :, :, sin.shape[-1] :], q[:, :, :, 0 : sin.shape[-1]]], + axis=-1, + ), + paddle.shape(q), + ) + query = paddle.add( + paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos) + ) + rotate_half_k = paddle.reshape( + paddle.concat( + [-k[:, :, :, sin.shape[-1] :], k[:, :, :, 0 : sin.shape[-1]]], + axis=-1, + ), + paddle.shape(k), + ) + key = paddle.add( + paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos) + ) + if v is not None: + rotate_half_v = paddle.reshape( + paddle.concat( + [ + -v[:, :, :, sin.shape[-1] :], + v[:, :, :, 0 : sin.shape[-1]], + ], + axis=-1, + ), + paddle.shape(v), + ) + value = paddle.add( + paddle.multiply(v, cos_pos), + paddle.multiply(rotate_half_v, sin_pos), + ) + return query, key, value + return query, key + + +def remove_padding(seq_lens, cu_seq_lens, inputs, token_num): + bsz, num_head, seq_len, dim_head = inputs.shape + output = paddle.zeros( + shape=[token_num, num_head * dim_head], dtype=inputs.dtype + ) + inputs = inputs.transpose([0, 2, 1, 3]).reshape([bsz, seq_len, -1]) + for i in range(bsz): + seq_len_now = seq_lens[i] + start_idx = cu_seq_lens[i] + end_idx = cu_seq_lens[i + 
1] + output[start_idx:end_idx, :] = inputs[i, :seq_len_now, :] + return output + + +def block_cache_to_naive_cache( + cache_k, cache_v, bsz, block_tables, cache_seq_len +): + _, num_head, blocksize, dim_head = cache_k.shape + out_cache_k = paddle.zeros( + shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_k.dtype + ) + out_cache_v = paddle.zeros( + shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_v.dtype + ) + for i in range(bsz): + for j in range(cache_seq_len): + out_cache_k[i, :, j, :] = cache_k[ + block_tables[i, j // blocksize], :, j % blocksize, : + ] + out_cache_v[i, :, j, :] = cache_v[ + block_tables[i, j // blocksize], :, j % blocksize, : + ] + return out_cache_k, out_cache_v + + +class TestBlockMultiHeadAttnRoPEXPU(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.name = "TestBlockMultiHeadAttnRoPE" + self.place = paddle.XPUPlace(0) + self.batch_size = 2 + self.num_head = 8 + self.seq_len = 64 + self.max_dec_len = 64 + self.dim_head = 64 + self.hid_dim = self.num_head * self.dim_head + self.blocksize = 64 + self.block_num_per_seq = ( + self.seq_len + self.max_dec_len + self.blocksize - 1 + ) // self.blocksize + self.rope = RopeEmbedding() + self.max_block_num = self.block_num_per_seq * self.batch_size + self.free_list = list(range(self.max_block_num - 1, -1, -1)) + self.seq_lens_encoder = paddle.to_tensor( + [ + self.seq_len, + ] + * self.batch_size, + "int32", + ) + self.seq_lens_decoder = paddle.to_tensor( + [ + 0, + ] + * self.batch_size, + "int32", + ) + self.seq_lens_this_time = paddle.to_tensor( + [ + self.seq_len, + ] + * self.batch_size, + "int32", + ) + self.shape = ( + self.batch_size, + self.num_head, + self.seq_len, + self.dim_head, + ) + self.cache_shape = ( + self.max_block_num, + self.num_head, + self.blocksize, + self.dim_head, + ) + self.dtype = 'float16' + self.attention_mask = create_attn_mask( + self.dtype, + self.batch_size, + [ + self.seq_len, + ] + * self.batch_size, + ) + self.scale = 1.0 / np.sqrt(self.shape[-1]) + self.cache_k = paddle.zeros(shape=self.cache_shape, dtype=self.dtype) + self.cache_v = paddle.zeros(shape=self.cache_shape, dtype=self.dtype) + self.block_tables = paddle.zeros( + shape=(self.batch_size, self.block_num_per_seq), dtype="int32" + ) + self.cache_k_per_batch_maxs = paddle.zeros( + [self.batch_size, 6], dtype="float32" + ) + self.cache_v_per_batch_maxs = paddle.zeros( + [self.batch_size, 6], dtype="float32" + ) + for i in range(self.batch_size): + need_block_num = ( + self.seq_len + self.max_dec_len + self.blocksize - 1 + ) // self.blocksize + for j in range(need_block_num): + self.block_tables[i, j] = self.free_list.pop() + ( + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + ) = get_padding_offset( + self.batch_size, self.seq_len, self.seq_lens_this_time + ) + self.token_num = self.padding_offset.shape[0] + + def get_rotary_position_embedding(self, position_ids, head_dim): + bsz, max_seq_len = position_ids.shape[:2] + rot_emb = paddle.zeros( + (2, bsz, max_seq_len, 1, head_dim), dtype="float32" + ) + inv_freq = 10000 ** ( + -paddle.arange(0, head_dim, 2, dtype="float32") / head_dim + ) + + # shape: [B, S, D/2] + freqs = paddle.einsum( + "ij,k->ijk", position_ids.cast("float32"), inv_freq + ) + # shape: [B, S, D] + emb = paddle.concat([freqs, freqs], axis=-1).reshape( + (bsz, max_seq_len, head_dim) + ) + # emb = paddle.stack([freqs], axis=-1).reshape( + # (bsz, max_seq_len, head_dim // 2) + # ) + # shape: [B, S, 1, D] + emb = paddle.unsqueeze(emb, 2) + + 
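+        # Editor's note (clarifying comment, not part of the original patch):
+        # rot_emb[0] carries the cos table and rot_emb[1] the sin table,
+        # shape [2, bsz, max_seq_len, 1, head_dim]; the test feeds this
+        # tensor to block_multihead_attention_xpu as its rotary_embs input.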
rot_emb[0] = paddle.cos(emb) + rot_emb[1] = paddle.sin(emb) + return rot_emb + + def test_all(self): + paddle.disable_static() + tmp_position_ids = paddle.arange( + self.seq_len + self.max_dec_len + ).reshape((1, -1)) + self.rope_emb = self.get_rotary_position_embedding( + tmp_position_ids, self.dim_head + ) + # encoder + query = np.random.random(self.shape) + q = paddle.to_tensor( + query, place=self.place, dtype=self.dtype, stop_gradient=False + ) + key = np.random.random(self.shape) + k = paddle.to_tensor( + key, place=self.place, dtype=self.dtype, stop_gradient=False + ) + value = np.random.random(self.shape) + v = paddle.to_tensor( + value, place=self.place, dtype=self.dtype, stop_gradient=False + ) + qkv = paddle.stack( + [ + q.transpose([0, 2, 1, 3]).reshape( + [self.token_num, self.hid_dim] + ), + k.transpose([0, 2, 1, 3]).reshape( + [self.token_num, self.hid_dim] + ), + v.transpose([0, 2, 1, 3]).reshape( + [self.token_num, self.hid_dim] + ), + ], + axis=1, + ).reshape([self.token_num, -1]) + sinusoidal_pos = self.rope._rotary_position_embedding( + self.seq_len, self.dim_head, "float32" + ) + q, k = self.rope._apply_neox_rope( + sinusoidal_pos.astype("float16"), q, k + ) + + out_ = naive_attention_impl( + q, k, v, None, None, None, None, self.attention_mask, self.scale + ) + out_ = remove_padding( + self.seq_lens_this_time, self.cu_seqlens_q, out_, self.token_num + ) + out = block_multihead_attention_xpu( + qkv, + self.cache_k, + self.cache_v, + self.seq_lens_encoder, + self.seq_lens_decoder, + self.seq_lens_this_time, + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + self.block_tables, + self.cache_k_per_batch_maxs, + self.cache_v_per_batch_maxs, + None, # pre_key_cache + None, # pre_value_cache + None, # cache_k_quant_scales + None, # cache_v_quant_scales + None, # cache_k_dequant_scales + None, # cache_v_dequant_scales + None, # qkv_out_scale + None, # qkv_bias + None, # out_shift + None, # out_smooth + None, # max_enc_len_this_time + None, # max_dec_len_this_time + self.rope_emb, # rotary_embs + None, # attn_mask + None, # tgt_mask + self.seq_len, + self.blocksize, + True, # use_neox_rotary_style + )[0] + np.testing.assert_allclose( + out.numpy(), + out_.numpy(), + rtol=5e-03, + atol=1e-03, + ) + + # decoder + naive_cache_k, naive_cache_v = block_cache_to_naive_cache( + self.cache_k, + self.cache_v, + self.batch_size, + self.block_tables, + self.seq_len, + ) + + self.seq_lens_decoder = self.seq_lens_encoder.clone() + self.seq_lens_encoder[:] = paddle.zeros_like(self.seq_lens_encoder) + self.seq_lens_this_time[:] = 1 + self.shape = ( + self.batch_size, + self.num_head, + 1, + self.dim_head, + ) + query = np.random.random(self.shape) + q = paddle.to_tensor( + query, place=self.place, dtype=self.dtype, stop_gradient=False + ) + key = np.random.random(self.shape) + k = paddle.to_tensor( + key, place=self.place, dtype=self.dtype, stop_gradient=False + ) + value = np.random.random(self.shape) + v = paddle.to_tensor( + value, place=self.place, dtype=self.dtype, stop_gradient=False + ) + + qkv = paddle.stack( + [ + q.transpose([0, 2, 1, 3]).reshape( + [self.batch_size, self.hid_dim] + ), + k.transpose([0, 2, 1, 3]).reshape( + [self.batch_size, self.hid_dim] + ), + v.transpose([0, 2, 1, 3]).reshape( + [self.batch_size, self.hid_dim] + ), + ], + axis=1, + ).reshape([self.batch_size, -1]) + + sinusoidal_pos = self.rope._rotary_position_embedding( + self.seq_len + 1, self.dim_head, "float32" + )[:, :, -1:, :] + q, k = self.rope._apply_neox_rope( + 
sinusoidal_pos.astype("float16"), q, k + ) + ( + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + ) = get_padding_offset(self.batch_size, 1, self.seq_lens_this_time) + out_ = ( + naive_attention_impl( + q, + k, + v, + naive_cache_k, + naive_cache_v, + None, + None, + None, + self.scale, + ) + .transpose([0, 2, 1, 3]) + .reshape([self.batch_size, -1]) + ) + out = block_multihead_attention_xpu( + qkv, + self.cache_k, + self.cache_v, + self.seq_lens_encoder, + self.seq_lens_decoder, + self.seq_lens_this_time, + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + self.block_tables, + self.cache_k_per_batch_maxs, + self.cache_v_per_batch_maxs, + None, # pre_key_cache + None, # pre_value_cache + None, # cache_k_quant_scales + None, # cache_v_quant_scales + None, # cache_k_dequant_scales + None, # cache_v_dequant_scales + None, # qkv_out_scale + None, # qkv_bias + None, # out_shift + None, # out_smooth + None, # max_enc_len_this_time + None, # max_dec_len_this_time + self.rope_emb, # rotary_embs + None, # attn_mask + None, # tgt_mask + 1, # seq_len, + self.blocksize, + True, # use_neox_rotary_style + )[0] + # NOTE: The diff of decoder is a little big + np.testing.assert_allclose( + out.numpy(), + out_.numpy(), + rtol=5e-02, + atol=5e-02, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/xpu/test_collective_api_base.py b/test/xpu/test_collective_api_base.py index 0c3d710a06335..c94061d5fc6d1 100644 --- a/test/xpu/test_collective_api_base.py +++ b/test/xpu/test_collective_api_base.py @@ -202,7 +202,7 @@ def setUp(self): self._trainers = 2 self._ps_endpoints = f"127.0.0.1:{self._find_free_port()},127.0.0.1:{self._find_free_port()}" self._python_interp = sys.executable - self._master_endpoints = "127.0.0.1:%s" % (self._find_free_port()) + self._master_endpoints = f"127.0.0.1:{self._find_free_port()}" self.temp_dir = tempfile.TemporaryDirectory() @@ -300,15 +300,15 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() with open(path0, "r") as f: - sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 0 stderr file: {f.read()}\n') with open(path1, "r") as f: - sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 1 stderr file: {f.read()}\n') def load_and_remove(path): with open(path, 'rb') as f: diff --git a/test/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py index 8a3289f0eb02a..c6cd081b498d7 100644 --- a/test/xpu/test_collective_base_xpu.py +++ b/test/xpu/test_collective_base_xpu.py @@ -244,8 +244,8 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() diff --git a/test/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py index df36f226408eb..4c7419ae9e5fd 100644 --- a/test/xpu/test_conv2d_op_xpu.py +++ b/test/xpu/test_conv2d_op_xpu.py 
@@ -36,14 +36,14 @@ def conv2d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCHW' or 'NHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCHW' or 'NHWC'." ) channel_last = data_format == "NHWC" diff --git a/test/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py index 57c564335fbc1..1728889827992 100644 --- a/test/xpu/test_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_conv2d_transpose_op_xpu.py @@ -31,8 +31,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if attrs['data_format'] == 'NHWC': diff --git a/test/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py index 021c57821c12d..26582b4e1b2c5 100644 --- a/test/xpu/test_conv3d_op_xpu.py +++ b/test/xpu/test_conv3d_op_xpu.py @@ -31,14 +31,14 @@ def conv3d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCDHW' or 'NDHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCDHW' or 'NDHWC'." ) channel_last = data_format == "NDHWC" diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py index 96077ae8c83d0..878519fbd507d 100644 --- a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py @@ -31,8 +31,8 @@ def depthwiseconv2dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." 
) if attrs['data_format'] == 'NHWC': diff --git a/test/xpu/test_parallel_dygraph_dataparallel.py b/test/xpu/test_parallel_dygraph_dataparallel.py index 0070f8ade9802..3eed21553b7a5 100644 --- a/test/xpu/test_parallel_dygraph_dataparallel.py +++ b/test/xpu/test_parallel_dygraph_dataparallel.py @@ -73,9 +73,11 @@ def start_local_trainers( for t in pod.trainers: proc_env = { "PADDLE_DISTRI_BACKEND": "bkcl", - "FLAGS_selected_xpus": "%s" % ",".join([str(g) for g in t.gpus]), + "FLAGS_selected_xpus": "{}".format( + ",".join([str(g) for g in t.gpus]) + ), "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), } diff --git a/test/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py index f62ffb4fc45a6..1d3c1def63bfb 100644 --- a/test/xpu/test_pool2d_op_xpu.py +++ b/test/xpu/test_pool2d_op_xpu.py @@ -172,8 +172,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): padding_algorithm = padding_algorithm.upper() if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if padding_algorithm == "VALID": diff --git a/test/xpu/test_pool3d_op_xpu.py b/test/xpu/test_pool3d_op_xpu.py index 865029ad0d07d..01dd6d77b2b86 100644 --- a/test/xpu/test_pool3d_op_xpu.py +++ b/test/xpu/test_pool3d_op_xpu.py @@ -68,8 +68,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): padding_algorithm = padding_algorithm.upper() if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if padding_algorithm == "VALID": diff --git a/test/xpu/test_swiglu_op_xpu.py b/test/xpu/test_swiglu_op_xpu.py new file mode 100644 index 0000000000000..35d8350c85e26 --- /dev/null +++ b/test/xpu/test_swiglu_op_xpu.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
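For orientation while reading the new test file that follows: the activation under test is SwiGLU, `silu(x) * y` with `silu(x) = x * sigmoid(x)`, and the fused kernel also accepts both operands packed into a single tensor along the last axis. A minimal NumPy sketch of the two call forms (helper names are illustrative; the packed form mirrors `fused_swiglu_impl(concat([x, y], axis=-1))` in the test):

```python
import numpy as np

def swiglu_ref(x, y):
    # silu(x) * y, where silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
    return x / (1.0 + np.exp(-x)) * y

def swiglu_packed_ref(xy):
    # Packed form: x and y concatenated along the last axis.
    x, y = np.split(xy, 2, axis=-1)
    return swiglu_ref(x, y)
```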
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl
+
+
+def swiglu(x, y, out_grad):
+    if isinstance(x, np.ndarray):
+        x = paddle.to_tensor(x)
+        y = paddle.to_tensor(y)
+        out_grad = paddle.to_tensor(out_grad)
+
+    origin_x = x.detach().clone()
+    origin_x.stop_gradient = False
+    x = origin_x
+
+    origin_y = y.detach().clone()
+    origin_y.stop_gradient = False
+    y = origin_y
+
+    dtype = x.dtype
+    need_convert = False
+    assert dtype == y.dtype
+    output_dtype = dtype
+
+    out = F.silu(x) * y
+    if need_convert:
+        out = out.astype(dtype)
+    out.backward(out_grad)
+    ret = [
+        out.astype(output_dtype),
+        origin_x.grad.astype(output_dtype),
+        origin_y.grad.astype(output_dtype),
+    ]
+    return ret
+
+
+def fused_swiglu(x, y, out_grad):
+    x = x.detach().clone()
+    x.stop_gradient = False
+    if y is not None:
+        y = y.detach().clone()
+        y.stop_gradient = False
+    out = fused_swiglu_impl(x, y)
+    out.backward(out_grad)
+
+    output_dtype = x.dtype
+    ret = [
+        out.astype(output_dtype),
+    ]
+    if y is not None:
+        x_grad, y_grad = x.grad, y.grad
+    else:
+        x_grad, y_grad = paddle.split(x.grad, 2, axis=-1)
+
+    ret.append(x_grad.astype(output_dtype))
+    ret.append(y_grad.astype(output_dtype))
+    return ret
+
+
+tol_map = {
+    paddle.float64: [1e-8, 1e-8],
+    paddle.float32: [1e-6, 1e-6],
+    paddle.float16: [1e-3, 1e-3],
+    paddle.bfloat16: [1e-2, 1e-2],
+}
+
+
+class TestSwiGLUDygraph(unittest.TestCase):
+    def setUp(self):
+        self.init_case()
+        self.seed = 1234
+
+    def init_case(self):
+        self.shape = []
+        self.shape.append([8, 100])
+        self.shape.append([4, 102])
+
+    def check_dygraph_impl(self, device, shape, dtype):
+        x = paddle.randn(shape, dtype=dtype)
+        y = paddle.randn(shape, dtype=dtype)
+        out_grad = paddle.randn(shape, dtype=dtype)
+
+        ret1 = swiglu(x, y, out_grad)
+        ret2 = fused_swiglu(x, y, out_grad)
+        ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad)
+
+        atol, rtol = tol_map[dtype]
+        err_msg = (
+            f"Failed when device = {device}, dtype = {dtype}, shape = {shape}"
+        )
+        for t1, t2, t3 in zip(ret1, ret2, ret3):
+            t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy()
+            np.testing.assert_allclose(
+                t1, t2, atol=atol, rtol=rtol, err_msg=err_msg
+            )
+            np.testing.assert_equal(t2, t3, err_msg=err_msg)
+
+    def check_dygraph(self, shape):
+        metas = []
+        metas.append(('xpu', paddle.float32))
+        metas.append(('xpu', paddle.float64))
+        # Enable in KL3
+        # metas.append(('xpu', paddle.float16))
+        # metas.append(('xpu', paddle.bfloat16))
+
+        for device, dtype in metas:
+            origin_device = paddle.get_device()
+            paddle.set_device(device)
+            for with_split in [True]:
+                self.check_dygraph_impl(device, shape, dtype)
+            paddle.set_device(origin_device)
+
+    def check_static_graph(self, shape, dtype="float32"):
+        x = paddle.static.data(name='x', shape=shape, dtype=dtype)
+        y = paddle.static.data(name='y', shape=shape, dtype=dtype)
+        concated_x = paddle.static.data(
+            name='concated_x',
+            shape=list(shape[:-1]) + [shape[-1] * 2],
+            dtype=dtype,
+        )
+        out1 = fused_swiglu_impl(x, y)
+        out2 = fused_swiglu_impl(concated_x)
+
+        concated_x_np = np.random.random(concated_x.shape).astype(dtype)
+        x_np, y_np = np.split(concated_x_np, 2, axis=-1)
+
+        exe = paddle.static.Executor()
+        t1, t2 = exe.run(
+            feed={'x': x_np, 'y': y_np, 'concated_x': concated_x_np},
+            fetch_list=[out1, out2],
+        )
+        np.testing.assert_equal(t1, t2)
+
+    def check_main(self, shape):
+        self.check_dygraph(shape)
+        paddle.enable_static()
+        with
paddle.static.program_guard(
+            paddle.static.Program(), paddle.static.Program()
+        ):
+            self.check_static_graph(shape)
+        paddle.disable_static()
+
+    def test_main(self):
+        for i in self.shape:
+            self.check_main(i)
+
+
+class TestSwigluOp(TestSwiGLUDygraph):
+    def init_case(self):
+        self.shape = [[1, 4096, 1376], [1, 4096, 11008]]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/xpu/test_zero_dim_tensor_xpu.py b/test/xpu/test_zero_dim_tensor_xpu.py
index 133c9b1302013..ac5e2df75b46f 100644
--- a/test/xpu/test_zero_dim_tensor_xpu.py
+++ b/test/xpu/test_zero_dim_tensor_xpu.py
@@ -345,7 +345,7 @@ def test_dygraph_binary(self):
             # 1) x is 0D, y is 0D
             x_np = np.random.randint(-10, 10, [])
             y_np = np.random.randint(-10, 10, [])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
@@ -357,7 +357,7 @@ def test_dygraph_binary(self):
             # 2) x is ND, y is 0D
             x_np = np.random.randint(-10, 10, [3, 5])
             y_np = np.random.randint(-10, 10, [])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
@@ -369,7 +369,7 @@ def test_dygraph_binary(self):
             # 3) x is 0D , y is ND
             x_np = np.random.randint(-10, 10, [])
             y_np = np.random.randint(-10, 10, [3, 5])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
diff --git a/third_party/onednn b/third_party/onednn
index 01204edbda1c2..0fb7e6ed4f32e 160000
--- a/third_party/onednn
+++ b/third_party/onednn
@@ -1 +1 @@
-Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56
+Subproject commit 0fb7e6ed4f32e5d89832b2bd742bbf834cd296ed
diff --git a/tools/CheckPRTemplate.py b/tools/CheckPRTemplate.py
index 1cc601dba0a29..a3a350d107af6 100644
--- a/tools/CheckPRTemplate.py
+++ b/tools/CheckPRTemplate.py
@@ -79,7 +79,7 @@ def parameter_accuracy(body):
             for i in value:
                 i = i.strip().lower()
                 if i not in test_list_lower:
-                    single_mess += '%s.' % i
+                    single_mess += f'{i}.'
         if len(single_mess) != 0:
             message += f'{key} should be in {test_list}. but now is [{single_mess}].'
     return message
diff --git a/tools/CrossStackProfiler/CspFileReader.py b/tools/CrossStackProfiler/CspFileReader.py
index 28038b5c76d3b..5802edc965cca 100755
--- a/tools/CrossStackProfiler/CspFileReader.py
+++ b/tools/CrossStackProfiler/CspFileReader.py
@@ -108,7 +108,7 @@ def printArgs(self):
 
     def _checkArgsKey(self, key, type):
         if key not in self._args:
-            raise KeyError("args should has key [%s]!" % key)
+            raise KeyError(f"args should have key [{key}]!")
 
         if not isinstance(self._args[key], type):
             raise TypeError(
@@ -130,17 +130,14 @@ def _checkArgs(self):
             or self._organizeForm == FILEORGANIZEFORM_BYOTHER
         ):
             raise NotImplementedError(
-                "we have not known how to process this form of file [%s]!"
-                % self._organizeForm
+                f"we do not know how to process this form of file [{self._organizeForm}]!"
             )
 
         self._checkArgsKey("gpuPerTrainer", int)
 
         self._checkArgsKey("dataPath", str)
         if not os.path.exists(self._dataPath):
-            raise OSError(
-                "input data path [%s] not existed!" % (self._dataPath)
-            )
+            raise OSError(f"input data path [{self._dataPath}] does not exist!")
 
         self._checkArgsKey("groupSize", int)
         self._checkArgsKey("displaySize", int)
@@ -183,8 +180,7 @@ def _getFileList(self):
                     newFileList.append(file)
                 else:
                     raise NotImplementedError(
-                        "[%s] is repeated by id, we don not how to process it!"
-                        % file
+                        f"[{file}] is repeated by id, we do not know how to process it!"
                     )
 
         if not self._fileList:
@@ -201,7 +197,7 @@ def _sortBySuffix(elem):
 
         if not self._fileList:
             self._logger.warning(
-                "we can not find any file in dir [%s]!" % self._dataPath
+                f"we cannot find any file in dir [{self._dataPath}]!"
             )
         else:
             self._logger.info(
@@ -215,12 +211,11 @@ def _sortBySuffix(elem):
 
     def _getId(self, fileName, organizeForm, sed="."):
         if self._organizeForm != organizeForm:
            raise TypeError(
-                "Can not get rank id when organizer form is not %s!"
-                % organizeForm
+                f"Cannot get rank id when organizer form is not {organizeForm}!"
            )
 
         if not os.path.isfile(fileName):
-            raise OSError("[%s] is not a valid file!" % (fileName))
+            raise OSError(f"[{fileName}] is not a valid file!")
 
         try:
             prefix_str = fileName.split(sed)[-1]
@@ -228,13 +223,12 @@ def _getId(self, fileName, organizeForm, sed="."):
             return int(prefix_str)
         except ValueError as e:
             print(e)
-            raise TypeError("invalid fileName [%s]" % fileName)
+            raise TypeError(f"invalid fileName [{fileName}]")
         except IndexError as e:
             print(e)
             raise TypeError(
-                "invalid fileName [%s], the prefix should be a number!"
-                % fileName
+                f"invalid fileName [{fileName}], the prefix should be a number!"
             )
 
     def getRankId(self, fileName, sed="."):
@@ -298,19 +292,15 @@ def getDcgmInfoDict(self, groupId, gpuId, tmpPath="./tmp"):
 
     def getDict(self, name, groupId, gpuId, tmpPath="./tmp"):
         fileName = self.getFileName(name, groupId, gpuId, tmpPath)
         if not os.path.isfile(fileName):
-            raise OSError("[%s] is not existed!" % fileName)
+            raise OSError(f"[{fileName}] does not exist!")
 
         data = {}
         with open(fileName, "r") as rf:
             try:
                 data = json.load(rf)
             except Exception:
-                self._logger.error(
-                    "read [%s] error. not a json file!" % (fileName)
-                )
-                raise TypeError(
-                    "read [%s] error. not a json file!" % (fileName)
-                )
+                self._logger.error(f"read [{fileName}] error: not a json file!")
+                raise TypeError(f"read [{fileName}] error: not a json file!")
         return data
 
     def dumpOpInfoDict(
@@ -344,7 +334,7 @@ def dumpDict(
         fileObject = open(fileName, 'w')
         fileObject.write(jsObj)
         fileObject.close()
-        self._logger.info("dump [%s] successfully!" % fileName)
+        self._logger.info(f"dump [{fileName}] successfully!")
 
 def getLogger():
diff --git a/tools/CrossStackProfiler/DCGMFileReader.py b/tools/CrossStackProfiler/DCGMFileReader.py
index f462ce5c9ad5e..eb31ad7820a78 100755
--- a/tools/CrossStackProfiler/DCGMFileReader.py
+++ b/tools/CrossStackProfiler/DCGMFileReader.py
@@ -88,7 +88,7 @@ def parseFileByGroup(self, groupId, processNum=8):
     def _parseTask(self, taskList, q=None):
         is_first = True
         for fileName in taskList:
-            self._logger.info("I am processing %s!" % fileName)
+            self._logger.info(f"I am processing {fileName}!")
             tmp_data = self._parseSingleFile(fileName)
             if tmp_data is None:
                 continue
@@ -103,7 +103,7 @@ def _parseTask(self, taskList, q=None):
         dcgm_data = dcgm_data.dropna()
         if q is not None:
             q.put(dcgm_data)
-        self._logger.info("I finish processing %s!"
% fileName) + self._logger.info(f"I finish processing {fileName}!") return dcgm_data def _parseSingleFile(self, fileName): @@ -192,7 +192,7 @@ def _getDCGMTraceInfoByGpuId( di = {} # name = "%s_%d" % (metric, trainerId) - name = "%s" % (metric) + name = f"{metric}" di['name'] = name di['pid'] = pid_map[metric] di['ts'] = self._align_ts(int(row['ts'])) diff --git a/tools/CrossStackProfiler/ProfileFileReader.py b/tools/CrossStackProfiler/ProfileFileReader.py index af955bd6652c4..266e9e5cf706d 100755 --- a/tools/CrossStackProfiler/ProfileFileReader.py +++ b/tools/CrossStackProfiler/ProfileFileReader.py @@ -46,7 +46,7 @@ def _parseTask(self, taskList, q=None): profile_dict["trainerRank.%03d" % (rankId)] = self._parseSingleFile( fileName ) - self._logger.info("I finish processing %s!" % fileName) + self._logger.info(f"I finish processing {fileName}!") if q is not None: q.put(profile_dict) diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py index 2f2d8b472c566..9d9ec062180cb 100644 --- a/tools/analysisPyXml.py +++ b/tools/analysisPyXml.py @@ -31,7 +31,7 @@ def analysisPyXml(rootPath, ut): for clazz in root.findall('packages/package/classes/class'): clazz_filename = clazz.attrib.get('filename') if not clazz_filename.startswith('/paddle'): - clazz_filename = '/paddle/%s' % clazz_filename + clazz_filename = f'/paddle/{clazz_filename}' for line in clazz.findall('lines/line'): line_hits = int(line.attrib.get('hits')) if line_hits != 0: diff --git a/tools/analysis_build_time.py b/tools/analysis_build_time.py index 6ae3ee6bbacc1..ae340a1bcfe03 100644 --- a/tools/analysis_build_time.py +++ b/tools/analysis_build_time.py @@ -33,10 +33,10 @@ def getUsefulBuildTimeFile(filename): def analysisBuildTime(): - filename = '%s/build/build-time' % root_path + filename = f'{root_path}/build/build-time' getUsefulBuildTimeFile(filename) - os.system('rm -rf %s/tools/tempbuildTime.txt' % root_path) - with open('%s/tools/analysis_build_time' % root_path, 'r') as f: + os.system(f'rm -rf {root_path}/tools/tempbuildTime.txt') + with open(f'{root_path}/tools/analysis_build_time', 'r') as f: lines = f.readlines() for line in lines: try: diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index b036c08e1d93e..6d422774d12ed 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,7 +32,7 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then cp $PADDLE_ROOT/paddle/scripts/paddle_build.sh $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh elif [[ "$SYSTEM" == "Windows_NT" ]];then git remote | grep upstream - if [ $? != 0 ]; then + if [ $? 
!= 0 ]; then
        git remote add upstream https://github.com/PaddlePaddle/Paddle.git
    fi
    git fetch upstream ${BRANCH}
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 4a8e7cf708994..c32e3c99f45a9 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -40,12 +40,18 @@ function add_failed(){
 api_params_diff=`python ${PADDLE_ROOT}/tools/check_api_compatible.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec`
 api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api`
+api_annotation_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.annotations ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.annotations`
 
 if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n"
     check_approval 1 XiaoguangHu01 jeff41404 lanxianghit qingqing01
 fi
 
+if [ "$api_annotation_diff" != "" ]; then
+    echo_line="You must have one member of Typing group (SigureMo, megemini, zrr1999, sunzhongkai588, luotao1) approval for API annotation change.\n"
+    check_approval 1 SigureMo megemini zrr1999 sunzhongkai588 luotao1
+fi
+
 api_yaml_diff=`python ${PADDLE_ROOT}/tools/check_api_yaml_same.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec ${BRANCH} ${PADDLE_ROOT}`
 if [ "$api_yaml_diff" != "" ]; then
     echo_line="API's name and params should be consistent with op's name and params in yaml.
@@ -133,7 +139,7 @@ if [ -n "${echo_list}" ];then
     echo "**************************************************************"
 
     # L40 L48 L62 has fetch the result out, but there are splitted.
-    if [ "${api_spec_diff}" != "" -o "${api_doc_spec_diff}" != "" ] ; then
+    if [ "${api_spec_diff}" != "" -o "${api_annotation_diff}" != "" ] ; then
         python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec
     fi
     if [ "${api_params_diff}" != "" ] ; then
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index d637c4f0c3b82..c844c09565da3 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -21,11 +21,12 @@ fi
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
 
 # If you want to add monitoring file modifications, please update the .github/CODEOWNERS file
-API_FILES=("tools/print_signatures.py"
-           "tools/sampcd_processor.py"
-           "tools/check_pr_approval.py"
-           "tools/checkout_api_compatible.py"
-           )
+API_FILES=(
+    "tools/print_signatures.py"
+    "tools/sampcd_processor.py"
+    "tools/check_pr_approval.py"
+    "tools/checkout_api_compatible.py"
+)
 
 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
 git_files=`git diff --numstat upstream/$BRANCH| wc -l`
diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index ca3df4bb99eef..82f7967133576 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -21,7 +21,7 @@
 
 def check_path_exists(path):
     """Assert whether file/directory exists."""
-    assert os.path.exists(path), "%s does not exist." % path
+    assert os.path.exists(path), f"{path} does not exist."
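The check_op_benchmark_result.py hunks continuing below gate PRs on relative timing diffs: the visible code computes the total-time change as (pr - develop) / develop, and the speed check fails a case above a 5% GPU-time regression. A compact sketch of that gating rule (the function name is illustrative, and it assumes the GPU-time diff is computed the same way as the total-time diff shown in the hunk):

```python
def speed_regressed(develop_time, pr_time, threshold=0.05):
    # Mirrors check_speed_result: a case fails the speed check when the
    # PR's time exceeds the develop baseline by more than 5% (relative).
    return (pr_time - develop_time) / develop_time > threshold
```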
def parse_case_name(log_file_name): @@ -48,7 +48,7 @@ def parse_log_file(log_file): pass # do nothing if result is None: - logging.warning("Parse %s fail!" % log_file) + logging.warning(f"Parse {log_file} fail!") return result @@ -81,29 +81,29 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result): develop_total_time = develop_data.get("total") total_time_diff = (pr_total_time - develop_total_time) / develop_total_time - logging.info("------ OP: %s ------" % case_name) + logging.info(f"------ OP: {case_name} ------") logging.info( f"GPU time change: {gpu_time_diff_str} (develop: {develop_gpu_time:.7f} -> PR: {pr_gpu_time:.7f})" ) logging.info( f"Total time change: {total_time_diff * 100:.5f}% (develop: {develop_total_time:.7f} -> PR: {pr_total_time:.7f})" ) - logging.info("backward: %s" % pr_result.get("backward")) + logging.info("backward: {}".format(pr_result.get("backward"))) logging.info("parameters:") for line in pr_result.get("parameters").strip().split("\n"): - logging.info("\t%s" % line) + logging.info(f"\t{line}") return gpu_time_diff > 0.05 def check_accuracy_result(case_name, pr_result): """Check accuracy result.""" - logging.info("------ OP: %s ------" % case_name) - logging.info("Accuracy diff: %s" % pr_result.get("diff")) - logging.info("backward: %s" % pr_result.get("backward")) + logging.info(f"------ OP: {case_name} ------") + logging.info("Accuracy diff: {}".format(pr_result.get("diff"))) + logging.info("backward: {}".format(pr_result.get("backward"))) logging.info("parameters:") for line in pr_result.get("parameters").strip().split("\n"): - logging.info("\t%s" % line) + logging.info(f"\t{line}") return not pr_result.get("consistent") @@ -154,11 +154,11 @@ def update_api_info_file(fail_case_list, api_info_file): def summary_results(check_results, api_info_file): """Summary results and return sys.exit code.""" for case_name in check_results["speed"]: - logging.error("Check speed result with case \"%s\" failed." % case_name) + logging.error(f"Check speed result with case \"{case_name}\" failed.") for case_name in check_results["accuracy"]: logging.error( - "Check accuracy result with case \"%s\" failed." % case_name + f"Check accuracy result with case \"{case_name}\" failed." ) if len(check_results["speed"]) and api_info_file: diff --git a/tools/check_sequence_op.sh b/tools/check_sequence_op.sh index 35357476a3224..51a482c3e9306 100644 --- a/tools/check_sequence_op.sh +++ b/tools/check_sequence_op.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 58e327327e6ad..93eb52a4f16aa 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -333,6 +333,6 @@ fi case $1 in run_op_benchmark) prepare_env - gpu_op_benchmark + gpu_op_benchmark ;; esac diff --git a/tools/cinn/gen_c++_tutorial.py b/tools/cinn/gen_c++_tutorial.py index 97e6d16fef088..be391b44ef730 100644 --- a/tools/cinn/gen_c++_tutorial.py +++ b/tools/cinn/gen_c++_tutorial.py @@ -59,13 +59,13 @@ def code_block(self, lang: str, block: List[str]): break else: tail_valid_offset += 1 - logging.warning("block0: %s" % block) + logging.warning(f"block0: {block}") block = ( block[pre_valid_offset:-tail_valid_offset] if tail_valid_offset > 0 else block[pre_valid_offset:] ) - logging.warning("block1: %s" % block) + logging.warning(f"block1: {block}") if not block: return @@ -189,7 +189,7 @@ def eat_roc(self, header: str, content: ContentGenerator) -> None: code_block.append(line) line: str = content.get_line() - logging.warning("DOC content: %s" % code_block) + logging.warning(f"DOC content: {code_block}") self.doc.code_block(lang, code_block) diff --git a/tools/codestyle/clang-tidy.py b/tools/codestyle/clang-tidy.py index 404413b9b9945..7fe5029cd1823 100644 --- a/tools/codestyle/clang-tidy.py +++ b/tools/codestyle/clang-tidy.py @@ -166,9 +166,9 @@ def get_tidy_invocation( os.close(handle) start.append(name) for arg in extra_arg: - start.append('-extra-arg=%s' % arg) + start.append(f'-extra-arg={arg}') for arg in extra_arg_before: - start.append('-extra-arg-before=%s' % arg) + start.append(f'-extra-arg-before={arg}') start.append('-p=' + build_path) if quiet: start.append('-quiet') diff --git a/tools/continuous_integration/bisect.py b/tools/continuous_integration/bisect.py index c4b31bb6e8729..2feaf7be5ec6e 100644 --- a/tools/continuous_integration/bisect.py +++ b/tools/continuous_integration/bisect.py @@ -84,11 +84,11 @@ def print_arguments(): [f'git rev-list --first-parent {args.good_commit}...{args.bad_commit}'], shell=True, ) -sys.stdout.write('commits found:\n%s\n' % ret) +sys.stdout.write(f'commits found:\n{ret}\n') commits = ret.strip().split('\n') os.chdir(args.build_dir) # Clean up previous logs. -subprocess.check_output(['echo "" > %s' % args.log_file], shell=True) +subprocess.check_output([f'echo "" > {args.log_file}'], shell=True) last_culprit = '' while True: @@ -96,8 +96,7 @@ def print_arguments(): os.chdir(args.git_dir) subprocess.check_output( [ - 'git checkout %s && git clean -fd && git checkout .' - % args.bisect_branch + f'git checkout {args.bisect_branch} && git clean -fd && git checkout .' ], shell=True, ) @@ -109,7 +108,7 @@ def print_arguments(): pick_idx = len(commits) / 2 pick = commits[pick_idx] os.chdir(args.git_dir) - subprocess.check_output(['git checkout %s' % pick], shell=True) + subprocess.check_output([f'git checkout {pick}'], shell=True) # Clean builds and compile. # We assume mainline commits should always compile. 
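bisect.py, whose remaining hunks continue below, is a hand-rolled first-bad-commit search: check out the middle commit of the suspect range, rebuild, run ctest, and discard the half of the range the result rules out. Stripped of the build and test plumbing, the halving logic looks roughly like this sketch (names illustrative; it assumes deterministic tests and commits ordered from known-good to known-bad):

```python
def first_bad(commits, is_good):
    # Each is_good() probe stands in for the cmake/ctest cycle.
    lo, hi = 0, len(commits) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if is_good(commits[mid]):
            lo = mid + 1  # culprit is newer than mid
        else:
            hi = mid      # mid already fails; culprit is mid or older
    return commits[lo]
```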
@@ -120,7 +119,7 @@ def print_arguments(): 'rm -rf * && ' f'cmake -DWITH_TESTING=ON {args.git_dir} >> {args.log_file} && make -j{args.build_parallel} >> {args.log_file}' ) - sys.stdout.write('cmd: %s\n' % cmd) + sys.stdout.write(f'cmd: {cmd}\n') try: subprocess.check_output([cmd], shell=True) except subprocess.CalledProcessError as e: @@ -130,7 +129,7 @@ def print_arguments(): passed = True try: cmd = f'ctest --repeat-until-fail {args.test_times} -R {args.test_target} >> {args.log_file}' - sys.stdout.write('cmd: %s\n' % cmd) + sys.stdout.write(f'cmd: {cmd}\n') subprocess.check_output([cmd], shell=True) except subprocess.CalledProcessError as e: passed = False @@ -145,4 +144,4 @@ def print_arguments(): break commits = commits[pick_idx + 1 :] -sys.stdout.write('Culprit commit: %s\n' % last_culprit) +sys.stdout.write(f'Culprit commit: {last_culprit}\n') diff --git a/tools/document_preview.sh b/tools/document_preview.sh index 47c5207074046..97c01ee96d03b 100755 --- a/tools/document_preview.sh +++ b/tools/document_preview.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -45,7 +45,7 @@ function get_docs_pr_num_from_paddle_pr_info(){ } # Attention: -# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. +# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. # 2. And /docs is used as the output of doc-build process. # 3. If conflicted with yours, please modify the definition of FLUIDDOCDIR and # OUTPUTDIR in the subsequent codes. diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh index 77ffe9c158c7d..ba419f77f2bc1 100644 --- a/tools/enforce/count_enforce_by_dir.sh +++ b/tools/enforce/count_enforce_by_dir.sh @@ -15,10 +15,10 @@ # limitations under the License. # This script is used to count detail PADDLE checks in the paddle/fluid directory, -# contains the number of PADDLE checks under each folder, the statistical data +# contains the number of PADDLE checks under each folder, the statistical data # does not include subdirectories, only covers all files under the current directory. -# -# The three columns of data are: total number, valid number, invalid number. +# +# The three columns of data are: total number, valid number, invalid number. # The output format is easy to display as a markdown table. 
# Usage: bash count_enforce_by_dir.sh (run in tools directory) @@ -70,8 +70,8 @@ function count_dir_independently(){ enforce_count $1"/"$file dir_total_check_cnt dir_valid_check_cnt sub_dir_total_check_cnt=$(($sub_dir_total_check_cnt+$dir_total_check_cnt)) sub_dir_valid_check_cnt=$(($sub_dir_valid_check_cnt+$dir_valid_check_cnt)) - - count_dir_independently $1"/"$file $dir_total_check_cnt $dir_valid_check_cnt + + count_dir_independently $1"/"$file $dir_total_check_cnt $dir_valid_check_cnt fi done total_check_cnt=$(($2-$sub_dir_total_check_cnt)) diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh index c79d486c62838..b06514a4e03bb 100644 --- a/tools/enforce/count_enforce_by_file.sh +++ b/tools/enforce/count_enforce_by_file.sh @@ -16,8 +16,8 @@ # This script is used to count PADDLE checks by files in the paddle/fluid/operators directory, # contains the number of PADDLE checks under each file. -# -# The three columns of data are: total number, valid number, invalid number. +# +# The three columns of data are: total number, valid number, invalid number. # The output format is easy to display as a markdown table. # Usage: bash count_enforce_by_file.sh [target directory or file] (run in tools directory) diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh index d60a26d157cce..057a67ef46a41 100644 --- a/tools/externalError/start.sh +++ b/tools/externalError/start.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
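For orientation, the count_enforce_by_dir.sh and count_enforce_by_file.sh scripts touched above tally PADDLE checks and print markdown-table-friendly totals. A rough Python equivalent of the per-directory tally (the file suffixes and regex here are assumptions for illustration; the real scripts additionally split totals into valid and invalid checks):

```python
import os
import re
from collections import Counter

CHECK = re.compile(r'PADDLE_ENFORCE\w*\(')

def count_checks(root):
    # Non-recursive per-directory totals, like count_enforce_by_dir.sh.
    counts = Counter()
    for dirpath, _, files in os.walk(root):
        for name in files:
            if name.endswith(('.cc', '.cu', '.h')):
                path = os.path.join(dirpath, name)
                with open(path, errors='ignore') as f:
                    counts[dirpath] += len(CHECK.findall(f.read()))
    return counts
```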
diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py index e0fc86c19a8cc..2dbfdd39c1a2c 100644 --- a/tools/final_ut_parallel_rule.py +++ b/tools/final_ut_parallel_rule.py @@ -19,7 +19,7 @@ def classify_cases_by_mem(rootPath): """classify cases by mem""" - case_filename = '%s/build/classify_case_by_cardNum.txt' % rootPath + case_filename = f'{rootPath}/build/classify_case_by_cardNum.txt' case_exec_100 = [ 'test_conv_eltwiseadd_bn_fuse_pass', 'test_trt_convert_pool2d', @@ -124,14 +124,14 @@ def classify_cases_by_mem(rootPath): else: case_mem_1[case] = new_lastest_mem[case]["mem_nvidia"] - with open('/pre_test/%s_mem0' % cardType, 'w') as f: + with open(f'/pre_test/{cardType}_mem0', 'w') as f: f.write(case_mem_0) f.close() case_mem_1_sort = sorted(case_mem_1.items(), key=lambda x: x[1]) case_mem_1_line = '^job$' mem_1_sum = 0 - with open('/pre_test/%s' % cardType, 'w') as f_not_0: + with open(f'/pre_test/{cardType}', 'w') as f_not_0: for index in case_mem_1_sort: if mem_1_sum < 14 * 1024 * 2: mem_1_sum += index[1] @@ -150,7 +150,7 @@ def classify_cases_by_mem(rootPath): f_not_0.write(case_mem_1_line + '\n') f_not_0.close() - os.system('cp %s/build/nightly_case /pre_test/' % rootPath) + os.system(f'cp {rootPath}/build/nightly_case /pre_test/') if __name__ == '__main__': diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh index 3ab1e68b37557..c57f3f6bba2b1 100755 --- a/tools/gen_alias_mapping.sh +++ b/tools/gen_alias_mapping.sh @@ -17,16 +17,16 @@ # Brief: # This code is used for generating the mapping list of Paddle API alias. # Only the APIs set with the `DEFINE_ALIAS` flag is enable. -# +# # Arguments: # None -# +# # Usage: -# Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh` +# Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh` # # Returns: # succ: 0 -# +# # Will also print the mapping list to stdout. The format of each line is as below: # <real API implement>\t<API recommend>,<API other alias name1>,<API other alias name2>,... @@ -38,7 +38,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \ | grep 'DEFINE_ALIAS' \ | perl -ne ' if (/\/python\/(.*):from (\.*)(\w.*) import (.*?)\s+#DEFINE_ALIAS\s+$/) { - my @arr = split(", ", $4); + my @arr = split(", ", $4); foreach $i (@arr) { printf "%s|%s|%s|%d\n", $3, $i, substr($1, 0, -3), length($2); } @@ -66,7 +66,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \ } key = key""new; n2o[key] = val; - } + } END { for (new in n2o) { old = n2o[new] in n2o ? n2o[n2o[new]] : n2o[new]; @@ -78,7 +78,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \ { o2n[$1] = o2n[$1] ? 
o2n[$1]","$3 : $3; } - END { + END { for (i in o2n) { print i"\t"o2n[i]; } diff --git a/tools/gen_tensor_stub.py b/tools/gen_tensor_stub.py index 00c7fb0c2e50c..422b3004f5266 100644 --- a/tools/gen_tensor_stub.py +++ b/tools/gen_tensor_stub.py @@ -15,17 +15,18 @@ from __future__ import annotations import argparse +import importlib import inspect import logging import re +import sys +import types from dataclasses import dataclass from functools import cached_property, lru_cache from typing import Any, Callable, Literal from typing_extensions import TypeAlias -import paddle - logging.basicConfig(style="{", format="{message}", level=logging.INFO) logger = logging.getLogger("Generating stub file for paddle.Tensor") logger.setLevel(logging.INFO) @@ -102,7 +103,6 @@ def find_apis(self, api_name: str) -> list[dict[str, tuple[str, int, int]]]: api = [] for mo in pattern.finditer(self._template): _indent = mo.group('indent') - _def_api = mo.group('def_api') _signature = mo.group('signature') _docstring = mo.group('docstring') _ellipsis = mo.group('ellipsis') @@ -110,26 +110,15 @@ def find_apis(self, api_name: str) -> list[dict[str, tuple[str, int, int]]]: _comment = '' if _comment is None else _comment _start_index, _end_index = mo.span() - - _start_indent = _start_index - _end_indent = _start_indent + len(_indent) - - _start_def_api = _end_indent - _end_def_api = _start_def_api + len(_def_api) - - _start_signature = _end_def_api - _end_signature = _start_signature + len(_signature) - - _start_docstring = _end_signature - _end_docstring = _start_docstring + len(_docstring) - - _start_ellipsis = _end_docstring - _end_ellipsis = _start_ellipsis + len(_ellipsis) - + _start_indent, _end_indent = mo.span('indent') + _start_signature, _end_signature = mo.span('signature') + _start_docstring, _end_docstring = mo.span('docstring') + _start_ellipsis, _end_ellipsis = mo.span('ellipsis') _start_comment = _end_ellipsis _end_comment = _start_comment + len(_comment) - assert _end_index == _end_comment + assert _start_index == _start_indent + assert _end_comment == _end_index _api = { 'indent': (_indent, _start_indent, _end_indent), @@ -216,7 +205,10 @@ def add_doc(self, doc: str): self.insert_template(docstring, _end_index, _end_index) def codegen(self) -> str: - return self._template + header = ( + '# This file is auto generated by `tools/gen_tensor_stub.py`.\n\n' + ) + return header + self._template def is_inherited_member(name: str, cls: type) -> bool: @@ -336,7 +328,27 @@ def func_doc_to_method_doc(func_doc: str) -> str: return method_doc +def try_import_paddle() -> types.ModuleType | None: + try: + return importlib.import_module('paddle') + except ModuleNotFoundError: + sys.stderr.write( + '''ERROR: Can NOT import paddle. + We could import paddle without installation, with all libs (.dll or .so) copied into dir `paddle/libs`, + or path already been set for the system. + ''' + ) + + def get_tensor_members(): + paddle = try_import_paddle() + if not paddle: + raise ( + ModuleNotFoundError( + 'Can NOT import paddle from tools/gen_tensor_stub.py.' 
+ ) + ) + tensor_class = paddle.Tensor members: dict[int, Member] = {} @@ -433,7 +445,7 @@ def get_tensor_template(path: str) -> str: return ''.join(f.readlines()) -def main(): +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -442,7 +454,6 @@ def main(): type=str, default="python/paddle/tensor/tensor.prototype.pyi", ) - parser.add_argument( "-o", "--output-file", @@ -452,12 +463,16 @@ def main(): args = parser.parse_args() + return args + + +def generate_stub_file(input_file=None, output_file=None): # Get members of Tensor tensor_members = get_tensor_members() logging.debug(f'total members in Tensor: {len(tensor_members)}') # Get tensor template - tensor_template = get_tensor_template(args.input_file) + tensor_template = get_tensor_template(input_file) # Generate the Tensor stub tensor_gen = TensorGen(tensor_template) @@ -473,9 +488,14 @@ def main(): tensor_gen.add_doc(member.doc) # Write to target file - with open(args.output_file, "w", encoding="utf-8") as f: + with open(output_file, "w", encoding="utf-8") as f: f.write(tensor_gen.codegen()) +def main(): + args = parse_args() + generate_stub_file(args.input_file, args.output_file) + + if __name__ == "__main__": main() diff --git a/tools/get_build_time.sh b/tools/get_build_time.sh index 496c8c12d6ca3..85100bb50c761 100755 --- a/tools/get_build_time.sh +++ b/tools/get_build_time.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh index bce338a8619e6..b7ec2e77a3a84 100755 --- a/tools/get_cpu_info.sh +++ b/tools/get_cpu_info.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -54,7 +54,7 @@ echo "OS Version : `uname -o`" echo "Kernel Release Version : `uname -r`" echo "Kernel Patch Version : `uname -v`" echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" -if command -v cmake >/dev/null 2>&1; then +if command -v cmake >/dev/null 2>&1; then cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'` else cmake_ver=" Not installed" diff --git a/tools/get_op_list.sh b/tools/get_op_list.sh index 2e4cad13582df..2b5d7f419b1d2 100644 --- a/tools/get_op_list.sh +++ b/tools/get_op_list.sh @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py index bf469eab98747..42b1c251f19a1 100644 --- a/tools/get_ut_file_map.py +++ b/tools/get_ut_file_map.py @@ -19,8 +19,8 @@ def get_all_paddle_file(rootPath): """get all file in Paddle repo: paddle/fluild, python""" - traverse_files = ['%s' % rootPath] - all_file_paddle = '%s/build/all_file_paddle' % rootPath + traverse_files = [f'{rootPath}'] + all_file_paddle = f'{rootPath}/build/all_file_paddle' all_file_paddle_list = [] with open(all_file_paddle, 'w') as f: for filename in traverse_files: @@ -32,7 +32,7 @@ def get_all_paddle_file(rootPath): def get_all_uts(rootPath): - all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + all_uts_paddle = f'{rootPath}/build/all_uts_paddle' os.system( fr'cd {rootPath}/build && ctest -N -V | grep -Ei "Test[ \t]+#" | grep -oEi "\w+$" > {all_uts_paddle}' ) @@ -42,28 +42,28 @@ def remove_useless_file(rootPath): """remove useless file in ut_file_map.json""" all_file_paddle_list = get_all_paddle_file(rootPath) ut_file_map_new = {} - ut_file_map = "%s/build/ut_file_map.json" % rootPath + ut_file_map = f"{rootPath}/build/ut_file_map.json" with open(ut_file_map, 'r') as load_f: load_dict = json.load(load_f) for key in load_dict: if key in all_file_paddle_list: ut_file_map_new[key] = load_dict[key] - with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + with open(f"{rootPath}/build/ut_file_map.json", "w") as f: json.dump(ut_file_map_new, f, indent=4) print("remove_useless_file ut_file_map success!!") def handle_ut_file_map(rootPath): utNotSuccess_list = [] - ut_map_path = "%s/build/ut_map" % rootPath + ut_map_path = f"{rootPath}/build/ut_map" files = os.listdir(ut_map_path) ut_file_map = {} count = 0 - not_success_file = open("%s/build/prec_delta" % rootPath, 'w') + not_success_file = open(f"{rootPath}/build/prec_delta", 'w') # if testdir is not made,write the test into prec_delta get_all_uts(rootPath) - all_ut = '%s/build/all_uts_paddle' % rootPath + all_ut = f'{rootPath}/build/all_uts_paddle' with open(all_ut, 'r') as f: all_ut_list = [] for ut in f.readlines(): @@ -73,7 +73,7 @@ def handle_ut_file_map(rootPath): for ut in all_ut_list: filedir = f'{rootPath}/build/ut_map/{ut}' if not os.path.exists(filedir): - not_success_file.write('%s\n' % ut) + not_success_file.write(f'{ut}\n') utNotSuccess_list.append(ut) # if fnda.tmp not exists,write the test into prec_delta for ut in files: @@ -108,7 +108,7 @@ def handle_ut_file_map(rootPath): ut_file_map[source_file].append(ut) f.close() else: - not_success_file.write('%s\n' % ut) + not_success_file.write(f'{ut}\n') utNotSuccess_list.append(ut) not_success_file.close() @@ -135,13 +135,13 @@ def handle_ut_file_map(rootPath): if source_file not in ut_file_map: ut_file_map[source_file] = [] f.close() - with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + with open(f"{rootPath}/build/ut_file_map.json", "w") as f: json.dump(ut_file_map, f, indent=4) def notsuccessfuc(rootPath): utNotSuccess = '' - ut_map_path = "%s/build/ut_map" % rootPath + ut_map_path = f"{rootPath}/build/ut_map" files = os.listdir(ut_map_path) count = 0 @@ -154,7 +154,7 @@ def notsuccessfuc(rootPath): pass else: count = count + 1 - utNotSuccess = utNotSuccess + 
'^%s$|' % ut + utNotSuccess = utNotSuccess + f'^{ut}$|' # ut not exec @@ -166,7 +166,7 @@ def notsuccessfuc(rootPath): if ut not in files: print(ut) count = count + 1 - utNotSuccess = utNotSuccess + '^%s$|' % ut + utNotSuccess = utNotSuccess + f'^{ut}$|' if utNotSuccess != '': print("utNotSuccess count: %s" % count) @@ -176,18 +176,17 @@ def notsuccessfuc(rootPath): def ut_file_map_supplement(rootPath): - ut_file_map_new = "%s/build/ut_file_map.json" % rootPath + ut_file_map_new = f"{rootPath}/build/ut_file_map.json" precision_test_map_store_dir = "/precision_test_map_store" - os.system('mkdir %s' % precision_test_map_store_dir) + os.system(f'mkdir {precision_test_map_store_dir}') os.system( - 'cd %s && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json --no-check-certificate' - % precision_test_map_store_dir + f'cd {precision_test_map_store_dir} && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json --no-check-certificate' ) - ut_file_map_old = "%s/ut_file_map.json" % precision_test_map_store_dir + ut_file_map_old = f"{precision_test_map_store_dir}/ut_file_map.json" with open(ut_file_map_new, 'r') as load_f: load_dict_new = json.load(load_f) - all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + all_uts_paddle = f'{rootPath}/build/all_uts_paddle' with open(all_uts_paddle, 'r') as f: all_uts_paddle_list = [] @@ -195,15 +194,14 @@ def ut_file_map_supplement(rootPath): all_uts_paddle_list.append(ut.strip()) f.close() - with open("%s/ut_file_map.json" % precision_test_map_store_dir, "w") as f: + with open(f"{precision_test_map_store_dir}/ut_file_map.json", "w") as f: json.dump(load_dict_new, f, indent=4) print("load_dict_new success!!") os.system( - 'cd %s && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta --no-check-certificate' - % precision_test_map_store_dir + f'cd {precision_test_map_store_dir} && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta --no-check-certificate' ) - prec_delta_new = "%s/build/prec_delta" % rootPath + prec_delta_new = f"{rootPath}/build/prec_delta" with open(prec_delta_new, 'r') as f: prec_delta_new_list = [] for ut in f.readlines(): @@ -212,7 +210,7 @@ def ut_file_map_supplement(rootPath): prec_delta_new_list.append( 'test_py_reader_error_msg' ) # add a python case for pycoverage - prec_delta_file = open("%s/prec_delta" % precision_test_map_store_dir, 'w') + prec_delta_file = open(f"{precision_test_map_store_dir}/prec_delta", 'w') for ut in prec_delta_new_list: prec_delta_file.write(ut + '\n') print("prec_delta_file success!!") @@ -220,7 +218,7 @@ def ut_file_map_supplement(rootPath): def utmap_analysis(rootPath): - ut_file_map_new = "%s/build/ut_file_map.json" % rootPath + ut_file_map_new = f"{rootPath}/build/ut_file_map.json" with open(ut_file_map_new, 'r') as load_f: load_dict_new = json.load(load_f) print(len(load_dict_new)) diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index 07122405a21d7..e0669fb85e658 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -1,11 +1,11 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -40,7 +40,7 @@ function get_quickly_disable_ut() { fi } -# disable test: +# disable test: # test_dygraph_dataparallel_bf16 # test_dygraph_sharding_stage2_bf16 # test_dygraph_sharding_stage3_bf16 diff --git a/tools/group_case_for_parallel.py b/tools/group_case_for_parallel.py index 66187ca4b0607..9af5e084bada2 100644 --- a/tools/group_case_for_parallel.py +++ b/tools/group_case_for_parallel.py @@ -40,29 +40,29 @@ def group_case_for_parallel(rootPath): ) # get nightly tests - nightly_tests_file = open('%s/tools/nightly_case' % rootPath, 'r') + nightly_tests_file = open(f'{rootPath}/tools/nightly_case', 'r') nightly_tests = nightly_tests_file.read().strip().split('\n') nightly_tests_file.close() parallel_case_file_list = [ - '%s/tools/single_card_tests_mem0' % rootPath, - '%s/tools/single_card_tests' % rootPath, - '%s/tools/multiple_card_tests_mem0' % rootPath, - '%s/tools/multiple_card_tests' % rootPath, - '%s/tools/exclusive_card_tests_mem0' % rootPath, - '%s/tools/exclusive_card_tests' % rootPath, + f'{rootPath}/tools/single_card_tests_mem0', + f'{rootPath}/tools/single_card_tests', + f'{rootPath}/tools/multiple_card_tests_mem0', + f'{rootPath}/tools/multiple_card_tests', + f'{rootPath}/tools/exclusive_card_tests_mem0', + f'{rootPath}/tools/exclusive_card_tests', ] - case_file = '%s/build/ut_list' % rootPath + case_file = f'{rootPath}/build/ut_list' if os.path.exists(case_file): f = open(case_file, 'r') all_need_run_cases = f.read().strip().split('\n') if len(all_need_run_cases) == 1 and all_need_run_cases[0] == '': f.close() - case_file = '%s/build/all_ut_list' % rootPath + case_file = f'{rootPath}/build/all_ut_list' f = open(case_file, 'r') all_need_run_cases = f.read().strip().split('\n') else: - case_file = '%s/build/all_ut_list' % rootPath + case_file = f'{rootPath}/build/all_ut_list' f = open(case_file, 'r') all_need_run_cases = f.read().strip().split('\n') @@ -71,7 +71,7 @@ def group_case_for_parallel(rootPath): all_group_case = [] for filename in parallel_case_file_list: fi = open(filename, 'r') - new_f = open('%s_new' % filename, 'w') + new_f = open(f'{filename}_new', 'w') lines = fi.readlines() new_case_file_list = [] for line in lines: @@ -88,7 +88,7 @@ def group_case_for_parallel(rootPath): for line in new_case_file_list: cases = '$|^'.join(case for case in line) - cases = '^job$|^%s$' % cases + cases = f'^job$|^{cases}$' new_f.write(cases + '\n') fi.close() new_f.close() @@ -98,10 +98,10 @@ def group_case_for_parallel(rootPath): if len(all_need_run_cases) != 0: for case in all_need_run_cases: if case not in nightly_tests: - cases = cases + '$|^%s' % case - cases = '%s$' % cases + cases = cases + f'$|^{case}' + cases = f'{cases}$' - new_f = open('%s/tools/no_parallel_case_file' % rootPath, 'w') + new_f = open(f'{rootPath}/tools/no_parallel_case_file', 'w') new_f.write(cases + '\n') new_f.close() f.close() diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py index 86458045d3de8..656e47fdba896 100644 --- a/tools/handle_h_cu_file.py +++ b/tools/handle_h_cu_file.py @@ -43,7 +43,7 @@ def threadPool(threadPoolNum): def get_h_file_md5(rootPath): - h_cu_files = '%s/tools/h_cu_files.log' % rootPath + h_cu_files = f'{rootPath}/tools/h_cu_files.log' f = 
open(h_cu_files) lines = f.readlines() for line in lines: @@ -52,7 +52,7 @@ def get_h_file_md5(rootPath): def insert_pile_to_h_file(rootPath): - h_cu_files = '%s/tools/h_cu_files.log' % rootPath + h_cu_files = f'{rootPath}/tools/h_cu_files.log' f = open(h_cu_files) lines = f.readlines() for line in lines: @@ -60,7 +60,7 @@ def insert_pile_to_h_file(rootPath): func = line.replace('/', '_').replace('.', '_') os.system(f'echo "\n#ifndef _PRECISE{func.upper()}_\n" >> {line}') os.system(f'echo "#define _PRECISE{func.upper()}_" >> {line}') - os.system('echo "\n#include <cstdio>\n" >> %s' % line) + os.system(f'echo "\n#include <cstdio>\n" >> {line}') os.system( f'echo "__attribute__((constructor)) static void calledFirst{func}()\n{{" >> {line}' ) @@ -68,43 +68,40 @@ def insert_pile_to_h_file(rootPath): 'echo \' fprintf(stderr,"precise test map fileeee: %%s\\\\n", __FILE__);\n}\' >> %s' % line ) - os.system('echo "\n#endif" >> %s' % line) + os.system(f'echo "\n#endif" >> {line}') def add_simple_cxx_test(rootPath): - variant_test_path = '%s/paddle/utils/variant_test.cc' % rootPath - variant_test_cmakeflie_path = '%s/paddle/utils/CMakeLists.txt' % rootPath + variant_test_path = f'{rootPath}/paddle/utils/variant_test.cc' + variant_test_cmakeflie_path = f'{rootPath}/paddle/utils/CMakeLists.txt' if os.path.exists(variant_test_path) and os.path.exists( variant_test_cmakeflie_path ): - simple_test_path = '%s/paddle/utils/simple_precision_test.cc' % rootPath - os.system('touch %s' % simple_test_path) + simple_test_path = f'{rootPath}/paddle/utils/simple_precision_test.cc' + os.system(f'touch {simple_test_path}') + os.system(f"echo '#include \"gtest/gtest.h\"\n' >> {simple_test_path}") os.system( - "echo '#include \"gtest/gtest.h\"\n' >> %s" % simple_test_path - ) - os.system( - 'echo "TEST(interface_test, type) { }\n" >> %s' % simple_test_path + f'echo "TEST(interface_test, type) {{ }}\n" >> {simple_test_path}' ) os.system('echo "cc_test(" >> %s' % variant_test_cmakeflie_path) os.system( - 'echo " simple_precision_test" >> %s' % variant_test_cmakeflie_path + f'echo " simple_precision_test" >> {variant_test_cmakeflie_path}' ) os.system( - 'echo " SRCS simple_precision_test.cc" >> %s' - % variant_test_cmakeflie_path + f'echo " SRCS simple_precision_test.cc" >> {variant_test_cmakeflie_path}' ) - os.system('echo " DEPS gtest)\n" >> %s' % variant_test_cmakeflie_path) + os.system(f'echo " DEPS gtest)\n" >> {variant_test_cmakeflie_path}') def remove_pile_from_h_file(rootPath): - h_cu_files = '%s/tools/h_cu_files.log' % rootPath + h_cu_files = f'{rootPath}/tools/h_cu_files.log' f = open(h_cu_files) lines = f.readlines() count = 12 for line in lines: line = line.strip() while count > 0: - os.system("sed -i '$d' %s" % line) + os.system(f"sed -i '$d' {line}") count = count - 1 count = 12 diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh index 31e1a44540133..bb851c11df6db 100755 --- a/tools/nvcc_lazy.sh +++ b/tools/nvcc_lazy.sh @@ -17,7 +17,7 @@ echo "#!/usr/bin/env bash" >> $1 echo "unset GREP_OPTIONS" >> $1 echo "set -e" >> $1 -echo -e >> $1 +echo -e >> $1 echo "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved." >> $1 echo "#" >> $1 echo "# Licensed under the Apache License, Version 2.0 (the \"License\");" >> $1 @@ -25,7 +25,7 @@ echo "# you may not use this file except in compliance with the License." 
>> $1
 echo "# You may obtain a copy of the License at" >> $1
 echo "#" >> $1
 echo "# http://www.apache.org/licenses/LICENSE-2.0" >> $1
-echo "#" >> $1 
+echo "#" >> $1
 echo "# Unless required by applicable law or agreed to in writing, software" >> $1
 echo "# distributed under the License is distributed on an \"AS IS\" BASIS," >> $1
 echo "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." >> $1
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index d09a04abd045c..ba3e08b154541 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -15,19 +15,34 @@
 Print all signature of a python module in alphabet order.
 
 Usage:
-    ./print_signature "paddle.base" > signature.txt
+    python tools/print_signatures.py "paddle" > API.spec
 """
+from __future__ import annotations
+
 import argparse
 import collections
 import hashlib
 import inspect
 import logging
 import pkgutil
+import re
 import sys
+from typing import Literal
 
 import paddle
 
+SpecFields = Literal[
+    "args",
+    "varargs",
+    "varkw",
+    "defaults",
+    "kwonlyargs",
+    "kwonlydefaults",
+    "annotations",
+    "document",
+]
+
 member_dict = collections.OrderedDict()
 
 visited_modules = set()
@@ -61,21 +76,6 @@ def md5(doc):
     return md5sum
 
 
-def is_primitive(instance):
-    int_types = (int,)
-    pritimitive_types = int_types + (float, str)
-    if isinstance(instance, pritimitive_types):
-        return True
-    elif isinstance(instance, (list, tuple, set)):
-        for obj in instance:
-            if not is_primitive(obj):
-                return False
-
-        return True
-    else:
-        return False
-
-
 ErrorSet = set()
 IdSet = set()
 skiplist = []
@@ -200,9 +200,7 @@ def insert_api_into_dict(full_name, gen_doc_anno=None):
         if gen_doc_anno:
             api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno
         if inspect.isfunction(obj):
-            api_info_dict[fc_id]["signature"] = repr(
-                inspect.getfullargspec(obj)
-            ).replace('FullArgSpec', 'ArgSpec', 1)
+            api_info_dict[fc_id]["signature"] = inspect.getfullargspec(obj)
     return api_info_dict[fc_id]
 
 
@@ -239,85 +237,6 @@ def process_module(m, attr="__all__"):
     return api_counter
 
 
-def check_public_api():
-    modulelist = [  # npqa
-        paddle,
-        paddle.amp,
-        paddle.nn,
-        paddle.nn.functional,
-        paddle.nn.initializer,
-        paddle.nn.utils,
-        paddle.static,
-        paddle.static.nn,
-        paddle.io,
-        paddle.jit,
-        paddle.metric,
-        paddle.distribution,
-        paddle.optimizer,
-        paddle.optimizer.lr,
-        paddle.regularizer,
-        paddle.text,
-        paddle.utils,
-        paddle.utils.download,
-        paddle.utils.cpp_extension,
-        paddle.sysconfig,
-        paddle.vision,
-        paddle.vision.datasets,
-        paddle.vision.models,
-        paddle.vision.transforms,
-        paddle.vision.ops,
-        paddle.distributed,
-        paddle.distributed.fleet,
-        paddle.distributed.fleet.utils,
-        paddle.distributed.parallel,
-        paddle.distributed.utils,
-        paddle.callbacks,
-        paddle.hub,
-        paddle.autograd,
-        paddle.incubate,
-        paddle.inference,
-        paddle.onnx,
-        paddle.device,
-        paddle.audio,
-        paddle.audio.backends,
-        paddle.audio.datasets,
-        paddle.sparse,
-        paddle.sparse.nn,
-        paddle.sparse.nn.functional,
-    ]
-
-    apinum = 0
-    alldict = {}
-    for module in modulelist:
-        if hasattr(module, '__all__'):
-            old_all = module.__all__
-        else:
-            old_all = []
-            dirall = dir(module)
-            for item in dirall:
-                if item.startswith('__'):
-                    continue
-                old_all.append(item)
-        apinum += len(old_all)
-        alldict.update({module.__name__: old_all})
-
-    old_all = []
-    dirall = dir(paddle.Tensor)
-    for item in dirall:
-        if item.startswith('_'):
-            continue
-        old_all.append(item)
-    apinum += len(old_all)
-    alldict.update({'paddle.Tensor': old_all})
-
-    for module, allapi in alldict.items():
-        for member_name in allapi:
-            cur_name = module + '.' + member_name
-            instance = eval(cur_name)
-            doc_md5 = md5(instance.__doc__)
-            member_dict[cur_name] = f"({cur_name}, ('document', '{doc_md5}'))"
-
-
 def check_allmodule_callable():
     modulelist = [paddle]
     for m in modulelist:
@@ -326,69 +245,89 @@ def check_allmodule_callable():
     return member_dict
 
 
+class ApiSpecFormatter:
+    def __init__(self, show_fields: SpecFields):
+        self.show_fields = show_fields
+
+    def format_spec(self, spec: inspect.FullArgSpec | None) -> str:
+        if spec is None:
+            return "ArgSpec()"
+        inner_str = ", ".join(
+            f"{field}={getattr(spec, field)!r}"
+            for field in spec._fields
+            if field in self.show_fields
+        )
+        return f"ArgSpec({inner_str})"
+
+    def format_doc(self, doc: str) -> str:
+        if "document" not in self.show_fields:
+            return "('document', '**********')"
+        return f"('document', '{md5(doc)}')"
+
+    def format(self, api_name: str, spec: inspect.FullArgSpec, doc: str) -> str:
+        return f"{api_name} ({self.format_spec(spec)}, {self.format_doc(doc)})"
+
+
 def parse_args():
     """
     Parse input arguments
     """
     parser = argparse.ArgumentParser(description='Print Apis Signatures')
-    parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument('module', type=str, help='module', default='paddle')
     parser.add_argument(
-        '--method',
-        dest='method',
+        '--skipped',
+        dest='skipped',
         type=str,
-        default='get_all_api',
-        help="using get_all_api or from_modulelist",
+        help='Skip checking submodules, supports regex',
+        default=r'paddle\.base\.libpaddle\.(eager|pir)\.ops',
    )
     parser.add_argument(
-        'module', type=str, help='module', default='paddle'
-    )  # not used
-    parser.add_argument(
-        '--skipped',
-        dest='skipped',
+        '--show-fields',
         type=str,
-        help='Skip Checking submodules',
-        default='paddle.base.libpaddle.eager.ops',
+        default="args,varargs,varkw,defaults,kwonlyargs,kwonlydefaults,annotations,document",
+        help="show fields in arg spec, separated by comma, e.g. 'args,varargs'",
     )
-
-    if len(sys.argv) == 1:
-        args = parser.parse_args(['paddle'])
-        return args
-    # parser.print_help()
-    # sys.exit(1)
-
     args = parser.parse_args()
     return args
 
 
+def create_api_filter(skipped_regex: str):
+    if not skipped_regex:
+        return lambda api_name: True
+    skipped_pattern = re.compile(skipped_regex)
+
+    def api_filter(api_name: str) -> bool:
+        return not skipped_pattern.match(api_name)
+
+    return api_filter
+
+
 if __name__ == '__main__':
     args = parse_args()
     check_allmodule_callable()
 
-    if args.method == 'from_modulelist':
-        check_public_api()
-        for name in member_dict:
-            print(name, member_dict[name])
-    elif args.method == 'get_all_api':
-        get_all_api()
-        all_api_names_to_k = {}
-        for k, api_info in api_info_dict.items():
-            # 1. the shortest suggested_name may be renamed;
-            # 2. some api's fullname is not accessable, the module name of it is overrided by the function with the same name;
-            api_name = sorted(api_info['all_names'])[0]
-            all_api_names_to_k[api_name] = k
-        all_api_names_sorted = sorted(all_api_names_to_k.keys())
-        for api_name in all_api_names_sorted:
-            if args.skipped != '' and api_name.find(args.skipped) >= 0:
-                continue
-            api_info = api_info_dict[all_api_names_to_k[api_name]]
-            print(
-                "{} ({}, ('document', '{}'))".format(
-                    api_name,
-                    api_info['signature']
-                    if 'signature' in api_info
-                    else 'ArgSpec()',
-                    md5(api_info['docstring']),
-                )
+    get_all_api(args.module)
+    api_filter = create_api_filter(args.skipped)
+    spec_formatter = ApiSpecFormatter(args.show_fields.split(','))
+
+    all_api_names_to_k = {}
+    for k, api_info in api_info_dict.items():
+        # 1. the shortest suggested_name may be renamed;
+        # 2. some api's fullname is not accessible; its module name is overridden by the function with the same name;
+        api_name = sorted(api_info['all_names'])[0]
+        all_api_names_to_k[api_name] = k
+    all_api_names_sorted = sorted(all_api_names_to_k.keys())
+    for api_name in all_api_names_sorted:
+        if not api_filter(api_name):
+            continue
+        api_info = api_info_dict[all_api_names_to_k[api_name]]
+
+        print(
+            spec_formatter.format(
+                api_name,
+                api_info.get('signature'),
+                api_info['docstring'],
            )
+        )
 
     if len(ErrorSet) == 0:
         sys.exit(0)
diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py
index 12f15a5dec6e1..d3758493d0c00 100644
--- a/tools/prune_for_jetson.py
+++ b/tools/prune_for_jetson.py
@@ -101,9 +101,9 @@ def prune_phi_kernels():
 def apply_patches():
     work_path = os.path.dirname(os.path.abspath(__file__)) + "/../"
     ret = os.system(
-        "cd %s && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
+        f"cd {work_path} && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
         " && rm -f paddle/fluid/inference/api/paddle_tensorrt_predictor.h "
-        " && git apply tools/infer_prune_patches/*.patch && cd -" % work_path
+        " && git apply tools/infer_prune_patches/*.patch && cd -"
     )
     return ret == 0
 
@@ -120,7 +120,7 @@ def append_fluid_kernels():
     for op in op_white_list:
         append_str = (
             append_str
-            + "file(APPEND ${pybind_file} \"USE_OP__(%s);\\n\")\n" % op
+            + f"file(APPEND ${{pybind_file}} \"USE_OP__({op});\\n\")\n"
         )
 
     with open(file_name, 'r', encoding='utf-8') as f:
@@ -154,11 +154,9 @@ def append_fluid_kernels():
     for op in op_white_list:
         patterns = {
-            "REGISTER_OPERATOR": r"REGISTER_OPERATOR\(\s*%s\s*," % op,
-            "REGISTER_OP_CPU_KERNEL": r"REGISTER_OP_CPU_KERNEL\(\s*%s\s*,"
-            % op,
-            "REGISTER_OP_CUDA_KERNEL": r"REGISTER_OP_CUDA_KERNEL\(\s*%s\s*,"
-            % op,
+            "REGISTER_OPERATOR": rf"REGISTER_OPERATOR\(\s*{op}\s*,",
+            "REGISTER_OP_CPU_KERNEL": rf"REGISTER_OP_CPU_KERNEL\(\s*{op}\s*,",
+            "REGISTER_OP_CUDA_KERNEL": rf"REGISTER_OP_CUDA_KERNEL\(\s*{op}\s*,",
         }
         for k, p in patterns.items():
             matches = re.findall(p, content, flags=re.DOTALL)
diff --git a/tools/sampcd_processor_utils.py b/tools/sampcd_processor_utils.py
index ff6de2b598326..aaf61fcd88dc0 100644
--- a/tools/sampcd_processor_utils.py
+++ b/tools/sampcd_processor_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
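To make the spec-line format concrete: ApiSpecFormatter above joins the sorted api name, the selected ArgSpec fields, and the docstring digest into one line of API.spec, and the PAT_API_SPEC_MEMBER / PAT_API_SPEC_SIGNATURE patterns defined just below in sampcd_processor_utils.py parse such lines back. A minimal sketch of the emitted shape, using a hypothetical function my.mod.demo rather than a real Paddle api:

    import hashlib
    import inspect

    def demo(x, axis=0, *, name=None):  # a hypothetical api, not part of Paddle
        """A hypothetical docstring."""

    spec = inspect.getfullargspec(demo)
    doc_md5 = hashlib.md5("A hypothetical docstring.".encode('utf-8')).hexdigest()
    # with --show-fields 'args,defaults,document' the formatter emits roughly:
    #   my.mod.demo (ArgSpec(args=['x', 'axis'], defaults=(0,)), ('document', '<32 hex chars>'))
    print(f"my.mod.demo (ArgSpec(args={spec.args!r}, defaults={spec.defaults!r}), ('document', '{doc_md5}'))")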
+from __future__ import annotations
+
 import argparse
 import inspect
 import logging
@@ -48,6 +50,12 @@
 API_DIFF_SPEC_FN = 'dev_pr_diff_api.spec'
 TEST_TIMEOUT = 10
 
+PAT_API_SPEC_MEMBER = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
+# include the ArgSpec so that changing an API's type annotation can trigger the CI
+PAT_API_SPEC_SIGNATURE = re.compile(
+    r'^(paddle[^,]+)\s+\((ArgSpec.*),.*document\W*([0-9a-z]{32})'
+)
+
 
 class Result:
     # name/key for result
@@ -66,7 +74,7 @@ class Result:
     order: int = 0
 
     @classmethod
-    def msg(cls, count: int, env: typing.Set) -> str:
+    def msg(cls, count: int, env: set) -> str:
         """Message for logging with api `count` and running `env`."""
         raise NotImplementedError
 
@@ -85,8 +93,8 @@ class MetaResult(type):
     def __new__(
         mcs,
         name: str,
-        bases: typing.Tuple[type, ...],
-        namespace: typing.Dict[str, typing.Any],
+        bases: tuple[type, ...],
+        namespace: dict[str, typing.Any],
     ) -> type:
         cls = super().__new__(mcs, name, bases, namespace)
         if issubclass(cls, Result):
@@ -104,7 +112,7 @@ def get(mcs, name: str) -> type:
         return mcs.__cls_map.get(name)
 
     @classmethod
-    def cls_map(mcs) -> typing.Dict[str, Result]:
+    def cls_map(mcs) -> dict[str, Result]:
         return mcs.__cls_map
 
 
@@ -290,7 +298,7 @@ def prepare(self, test_capacity: set) -> None:
         """
         pass
 
-    def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
+    def run(self, api_name: str, docstring: str) -> list[TestResult]:
         """Extract codeblocks from docstring, and run the test.
 
         Run only one docstring at a time.
@@ -304,7 +312,7 @@ def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
         raise NotImplementedError
 
     def print_summary(
-        self, test_results: typing.List[TestResult], whl_error: typing.List[str]
+        self, test_results: list[TestResult], whl_error: list[str]
     ) -> None:
         """Post process test results and print test summary.
@@ -333,17 +341,17 @@ def get_api_md5(path):
     API_spec = os.path.abspath(os.path.join(os.getcwd(), "..", path))
     if not os.path.isfile(API_spec):
         return api_md5
-    pat = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
-    patArgSpec = re.compile(
-        r'^(paddle[^,]+)\s+\(ArgSpec.*document\W*([0-9a-z]{32})'
-    )
+
     with open(API_spec) as f:
         for line in f.readlines():
-            mo = pat.search(line)
-            if not mo:
-                mo = patArgSpec.search(line)
+            mo = PAT_API_SPEC_MEMBER.search(line)
+            if mo:
                 api_md5[mo.group(1)] = mo.group(2)
+            else:
+                mo = PAT_API_SPEC_SIGNATURE.search(line)
+                if mo:
+                    api_md5[mo.group(1)] = f'{mo.group(2)}, {mo.group(3)}'
+
     return api_md5
 
 
@@ -397,18 +405,6 @@ def get_full_api_from_pr_spec():
         get_full_api_by_walk()
 
 
-def get_full_api():
-    """
-    get all the apis
-    """
-    global API_DIFF_SPEC_FN  # readonly
-    from print_signatures import get_all_api_from_modulelist
-
-    member_dict = get_all_api_from_modulelist()
-    with open(API_DIFF_SPEC_FN, 'w') as f:
-        f.write("\n".join(member_dict.keys()))
-
-
 def extract_code_blocks_from_docstr(docstr, google_style=True):
     """
     extract code-blocks from the given docstring.
@@ -599,9 +595,16 @@ def get_test_capacity(run_on_device="cpu"):
     return sample_code_test_capacity
 
 
-def get_docstring(full_test=False):
+def get_docstring(
+    full_test: bool = False,
+    filter_api: typing.Callable[[str], bool] | None = None,
+):
     '''
     this function will get the docstring for test.
+
+    Args:
+        full_test, get all the apis
+        filter_api, a function that filters apis; if it returns `True`, the api is skipped (not added to `docstrings_to_test`).
''' import paddle import paddle.static.quantization # noqa: F401 @@ -616,6 +619,9 @@ def get_docstring(full_test=False): with open(API_DIFF_SPEC_FN) as f: for line in f.readlines(): api = line.replace('\n', '') + if filter_api is not None and filter_api(api.strip()): + continue + try: api_obj = eval(api) except AttributeError: @@ -637,7 +643,7 @@ def get_docstring(full_test=False): return docstrings_to_test, whl_error -def check_old_style(docstrings_to_test: typing.Dict[str, str]): +def check_old_style(docstrings_to_test: dict[str, str]): old_style_apis = [] for api_name, raw_docstring in docstrings_to_test.items(): for codeblock in extract_code_blocks_from_docstr( @@ -715,8 +721,8 @@ def exec_gen_doc(): def get_test_results( - doctester: DocTester, docstrings_to_test: typing.Dict[str, str] -) -> typing.List[TestResult]: + doctester: DocTester, docstrings_to_test: dict[str, str] +) -> list[TestResult]: """Get test results from doctester with docstrings to test.""" _test_style = ( doctester.style diff --git a/tools/statistics_UT_resource.sh b/tools/statistics_UT_resource.sh index a6f1f264c4cd2..f97fc6f0dc51d 100644 --- a/tools/statistics_UT_resource.sh +++ b/tools/statistics_UT_resource.sh @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/test_print_signatures.py b/tools/test_print_signatures.py index 8a3bc60dcf9a7..20345d77b2566 100644 --- a/tools/test_print_signatures.py +++ b/tools/test_print_signatures.py @@ -25,7 +25,7 @@ import hashlib import unittest -from print_signatures import is_primitive, md5 +from print_signatures import md5 def func_example(param_a, param_b): @@ -62,26 +62,5 @@ def test_md5(self): self.assertEqual(digest, md5(func_example.__doc__)) -class Test_is_primitive(unittest.TestCase): - def test_single(self): - self.assertTrue(is_primitive(2)) - self.assertTrue(is_primitive(2.1)) - self.assertTrue(is_primitive("2.1.1")) - self.assertFalse(is_primitive(b"hello paddle")) - self.assertFalse(is_primitive(1j)) - self.assertTrue(is_primitive(True)) - - def test_collection(self): - self.assertTrue(is_primitive([])) - self.assertTrue(is_primitive(())) - self.assertTrue(is_primitive(set())) - self.assertTrue(is_primitive([1, 2])) - self.assertTrue(is_primitive((1.1, 2.2))) - self.assertTrue(is_primitive({1, 2.3})) - self.assertFalse(is_primitive(range(3))) - self.assertFalse(is_primitive({})) - self.assertFalse(is_primitive([1, 1j])) - - if __name__ == '__main__': unittest.main() diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index 62c51a73ba8a7..c61c7e610f98c 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -103,19 +103,23 @@ def tearDown(self): def test_get_api_md5(self): res = get_api_md5('paddle/fluid/API_PR.spec') self.assertEqual( - "ff0f188c95030158cc6398d2a6c55one", res['paddle.one_plus_one'] + "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6c55one", + res['paddle.one_plus_one'], ) self.assertEqual( - "ff0f188c95030158cc6398d2a6c55two", res['paddle.two_plus_two'] + 
"ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6c55two", + res['paddle.two_plus_two'], ) self.assertEqual( - "ff0f188c95030158cc6398d2a6cthree", res['paddle.three_plus_three'] + "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6cthree", + res['paddle.three_plus_three'], ) self.assertEqual( "ff0f188c95030158cc6398d2a6c5four", res['paddle.four_plus_four'] ) self.assertEqual( - "ff0f188c95030158cc6398d2a6c5five", res['paddle.five_plus_five'] + "ArgSpec(), ff0f188c95030158cc6398d2a6c5five", + res['paddle.five_plus_five'], ) @@ -302,8 +306,8 @@ def test_global_exec(self): >>> import paddle >>> a = paddle.to_tensor(.2) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'set_default': """ placeholder @@ -319,8 +323,8 @@ def test_global_exec(self): >>> paddle.set_default_dtype('float64') >>> a = paddle.to_tensor(.2) >>> print(a) - Tensor(shape=[1], dtype=float64, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'after_set_default': """ placeholder @@ -335,8 +339,8 @@ def test_global_exec(self): >>> import paddle >>> a = paddle.to_tensor(.2) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, } @@ -509,10 +513,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.20000000) """, 'cpu_to_cpu': """ @@ -528,10 +532,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'gpu_to_cpu': """ @@ -547,10 +551,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'cpu_to_gpu': """ @@ -566,10 +570,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), 
stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.20000000) """, 'gpu_to_cpu_array': """ placeholder @@ -701,8 +705,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.123456780) """, 'cpu_to_cpu': """ @@ -719,8 +723,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.123456780) """, 'gpu_to_cpu': """ @@ -737,8 +741,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.123456780) """, 'cpu_to_gpu': """ @@ -755,8 +759,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.123456780) """, 'gpu_to_cpu_array': """ placeholder @@ -2046,7 +2050,7 @@ def test_timeout(self): def test_bad_statements(self): docstrings_to_test = { - 'bad_fluid': """ + 'good_fluid': """ this is docstring... Examples: @@ -2191,9 +2195,9 @@ def test_bad_statements(self): tr_10, ) = test_results - self.assertIn('bad_fluid', tr_0.name) - self.assertTrue(tr_0.badstatement) - self.assertFalse(tr_0.passed) + self.assertIn('good_fluid', tr_0.name) + self.assertFalse(tr_0.badstatement) + self.assertTrue(tr_0.passed) self.assertIn('bad_fluid_from', tr_1.name) self.assertTrue(tr_1.badstatement) diff --git a/tools/test_type_checking.py b/tools/test_type_checking.py new file mode 100644 index 0000000000000..714be765ca9b5 --- /dev/null +++ b/tools/test_type_checking.py @@ -0,0 +1,630 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from tools.type_checking import MypyChecker, get_test_results + + +class TestMypyChecker(unittest.TestCase): + def test_mypy_pass(self): + docstrings_pass = { + 'simple': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import abc + >>> print(1) + 1 + """, + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... 
+ + >>> # doctest: -REQUIRES(env:GPU) + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> print(1-1) + 0 + """, + } + docstrings_from_sampcd = { + 'gpu_to_gpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [0.123456780]) + + """, + 'cpu_to_cpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.123456780]) + + """, + 'gpu_to_cpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.123456780]) + + """, + 'cpu_to_gpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [0.123456780]) + """, + 'gpu_to_cpu_array': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor([[1.123456789 ,2,3], [2,3,4], [3,4,5]]) + >>> print(a) + Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[1.123456780, 2., 3.], + [2., 3., 4.], + [3., 4., 5.]]) + """, + 'cpu_to_gpu_array': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> a = paddle.to_tensor([[1.123456789,2,3], [2,3,4], [3,4,5]]) + >>> print(a) + Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[1.123456780, 2., 3.], + [2., 3., 4.], + [3., 4., 5.]]) + """, + 'mass_array': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor( + ... [[1.123456780, 2., -3, .3], + ... [2, 3, +4., 1.2+10.34e-5j], + ... [3, 5.e-3, 1e2, 3e-8]] + ... ) + >>> # Tensor(shape=[3, 4], dtype=complex64, place=Place(gpu:0), stop_gradient=True, + >>> # [[ (1.1234568357467651+0j) , + >>> # (2+0j) , + >>> # (-3+0j) , + >>> # (0.30000001192092896+0j) ], + >>> # [ (2+0j) , + >>> # (3+0j) , + >>> # (4+0j) , + >>> # (1.2000000476837158+0.00010340000153519213j)], + >>> # [ (3+0j) , + >>> # (0.004999999888241291+0j) , + >>> # (100+0j) , + >>> # (2.999999892949745e-08+0j) ]]) + >>> print(a) + Tensor(shape=[3, 4], dtype=complex64, place=Place(AAA), stop_gradient=True, + [[ (1.123456+0j), + (2+0j), + (-3+0j), + (0.3+0j)], + [ (2+0j), + (3+0j), + (4+0j), + (1.2+0.00010340j)], + [ (3+0j), + (0.00499999+0j), + (100+0j), + (2.999999e-08+0j)]]) + """, + 'float_array': """ + placeholder + + Examples: + + .. 
code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> x = [[2, 3, 4], [7, 8, 9]] + >>> x = paddle.to_tensor(x, dtype='float32') + >>> print(paddle.log(x)) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0.69314718, 1.09861231, 1.38629436], + [1.94591010, 2.07944155, 2.19722462]]) + + """, + 'float_array_diff': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> x = [[2, 3, 4], [7, 8, 9]] + >>> x = paddle.to_tensor(x, dtype='float32') + >>> print(paddle.log(x)) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0.69314712, 1.09861221, 1.386294], + [1.94591032, 2.07944156, 2.1972246]]) + + """, + 'float_begin': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0) + 7. + + """, + 'float_begin_long': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0000023) + 7.0000024 + + """, + 'float_begin_more': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 + + """, + 'float_begin_more_diff': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 + + """, + 'float_begin_more_brief': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0, 5., 6.123456) + 7. 5. 6.123457 + + """, + 'float_begin_fail': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0100023) + 7.0000024 + + """, + } + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_pass) + self.assertEqual(len(test_results), 3) + + for tr in test_results: + self.assertFalse(tr.fail) + + test_results = get_test_results(doctester, docstrings_from_sampcd) + self.assertEqual(len(test_results), 15) + + for tr in test_results: + print(tr.msg) + self.assertFalse(tr.fail) + + def test_mypy_fail(self): + docstrings_fail = { + 'fail_simple': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import blabla + """, + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_fail) + self.assertEqual(len(test_results), 3) + + for tr in test_results: + self.assertTrue(tr.fail) + + def test_mypy_partial_fail(self): + docstrings_fail = { + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. 
code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> print(1-1) + 0 + """ + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_fail) + self.assertEqual(len(test_results), 2) + + tr_0, tr_1 = test_results + self.assertTrue(tr_0.fail) + self.assertFalse(tr_1.fail) + + def test_mypy_ignore(self): + docstrings_ignore = { + 'fail_simple': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # type: ignore + >>> import blabla + """, + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # type: ignore + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # type: ignore + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_ignore) + self.assertEqual(len(test_results), 3) + + for tr in test_results: + print(tr.msg) + self.assertFalse(tr.fail) + + docstrings_pass = { + 'pass': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> a = 1 + >>> # type: ignore + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> b = 2 + >>> # type: ignore + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_pass) + self.assertEqual(len(test_results), 2) + + for tr in test_results: + print(tr.msg) + self.assertFalse(tr.fail) + + docstrings_fail = { + 'fail': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import blabla + >>> a = 1 + >>> # type: ignore + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> import blabla + >>> # type: ignore + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_fail) + self.assertEqual(len(test_results), 2) + + for tr in test_results: + print(tr.msg) + self.assertTrue(tr.fail) diff --git a/tools/timeline.py b/tools/timeline.py index ff8d0946378d7..5e16e0b9bf4f3 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -148,7 +148,7 @@ def _allocate_pids(self): self._devices[(k, event.device_id, "CPU")] = pid # -1 device id represents CUDA API(RunTime) call.(e.g. 
cudaLaunch, cudaMemcpy) if event.device_id == -1: - self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) + self._chrome_trace.emit_pid(f"{k}:cuda_api", pid) else: self._chrome_trace.emit_pid( "%s:cpu:block:%d" % (k, event.device_id), pid diff --git a/tools/timeout_debug_help.sh b/tools/timeout_debug_help.sh index 45de2db87e853..fcc6d473e49eb 100644 --- a/tools/timeout_debug_help.sh +++ b/tools/timeout_debug_help.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,7 +17,7 @@ set +e failed_uts=$1 need_debug_ut_re='test_dist_fleet' cat_log_judge=$(echo "${failed_uts}" | grep 'Timeout' | grep -oEi "$need_debug_ut_re" ) -if [[ "$cat_log_judge" != "" ]];then +if [[ "$cat_log_judge" != "" ]];then echo "==============================================" echo "show timeout ut logs" echo "==============================================" diff --git a/tools/type_checking.py b/tools/type_checking.py new file mode 100644 index 0000000000000..78285cb87eaa4 --- /dev/null +++ b/tools/type_checking.py @@ -0,0 +1,276 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# We type-check the `Example` codes from docstring. 
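MypyChecker.run below reduces to three steps: truncate the codeblock at any `>>> # type: ignore` marker, strip the doctest directives, then hand the recovered source lines to mypy's programmatic API. A minimal self-contained sketch of that pipeline, assuming only that mypy is installed (the sample snippet is invented):

    import doctest
    import re

    from mypy import api as mypy_api

    sample = '''
    >>> x: int = 1  # doctest: +SKIP('illustration only')
    >>> x + 1
    2
    '''

    # strip doctest/xdoctest directives, or `get_examples` would keep them in the source
    cleaned = re.sub(r'#\s*x?doctest\s*:.*', '', sample)
    # recover the bare source lines, with `>>>` and `...` prompts removed
    examples = doctest.DocTestParser().get_examples(cleaned)
    source = '\n'.join(line for e in examples for line in e.source.splitlines())
    normal_report, error_report, exit_status = mypy_api.run(['-c', source])
    print(exit_status)  # 0 when the snippet type-checks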
+
+from __future__ import annotations
+
+import argparse
+import doctest
+import pathlib
+import re
+from abc import abstractmethod
+from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass, field
+from typing import Any
+
+from mypy import api as mypy_api
+from sampcd_processor_utils import (
+    extract_code_blocks_from_docstr,
+    get_docstring,
+    init_logger,
+    log_exit,
+    logger,
+)
+
+
+class TypeChecker:
+    style: str = 'google'
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+    @abstractmethod
+    def run(self, api_name: str, codeblock: str) -> TestResult:
+        pass
+
+    @abstractmethod
+    def print_summary(
+        self, test_results: list[TestResult], whl_error: list[str]
+    ) -> None:
+        pass
+
+
+@dataclass
+class TestResult:
+    api_name: str
+    msg: str
+    fail: bool = False
+    extra_info: dict[str, Any] = field(default_factory=dict)
+
+
+class MypyChecker(TypeChecker):
+    def __init__(
+        self, config_file: str, cache_dir: str, *args: Any, **kwargs: Any
+    ) -> None:
+        self.config_file = config_file
+        self.cache_dir = cache_dir
+        super().__init__(*args, **kwargs)
+
+    def run(self, api_name: str, codeblock: str) -> TestResult:
+        # stop collecting lines at the first `>>> # type: ignore` marker; the rest of the codeblock is skipped
+        codeblock_for_checking = []
+        for line in codeblock.splitlines():
+            if line.strip().startswith('>>> # type: ignore'):
+                break
+            codeblock_for_checking.append(line)
+        codeblock_for_checking = '\n'.join(codeblock_for_checking)
+
+        # remove `doctest` directives from the codeblock, otherwise the module `doctest` cannot `get_examples` correctly
+        codeblock_for_checking = re.sub(
+            r'#\s*x?doctest\s*:.*', '', codeblock_for_checking
+        )
+
+        # `get_examples` codes with `>>>` and `...` stripped
+        _example_code = doctest.DocTestParser().get_examples(
+            codeblock_for_checking
+        )
+        example_code = '\n'.join(
+            [l for e in _example_code for l in e.source.splitlines()]
+        )
+
+        normal_report, error_report, exit_status = mypy_api.run(
+            [
+                f'--config-file={self.config_file}',
+                f'--cache-dir={self.cache_dir}',
+                '-c',
+                example_code,
+            ]
+        )
+
+        logger.debug('-' * 20)
+        logger.debug(f'>>> Type hints with api {api_name} start ...')
+        logger.debug(example_code)
+        logger.debug('>>> Results ...')
+        logger.debug('>>> mypy normal_report is ...')
+        logger.debug(normal_report)
+        logger.debug('>>> mypy error_report is ...')
+        logger.debug(error_report)
+        logger.debug('>>> mypy exit_status is ...')
+        logger.debug(exit_status)
+        logger.debug(f'>>> Type hints with api {api_name} end...')
+
+        return TestResult(
+            api_name=api_name,
+            msg='\n'.join([normal_report, error_report]),
+            fail=exit_status != 0,
+            extra_info={
+                'normal_report': normal_report,
+                'error_report': error_report,
+                'exit_status': exit_status,
+            },
+        )
+
+    def print_summary(
+        self, test_results: list[TestResult], whl_error: list[str]
+    ) -> None:
+        is_fail = False
+
+        logger.warning("----------------Check results--------------------")
+
+        if whl_error is not None and whl_error:
+            logger.warning("%s is not in whl.", whl_error)
+            logger.warning("")
+            logger.warning("Please check the whl package and API_PR.spec!")
+            logger.warning(
+                "You can follow these steps in order to generate API.spec:"
+            )
+            logger.warning("1. cd ${paddle_path}, compile paddle;")
+            logger.warning(
+                "2. pip install build/python/dist/(build whl package);"
+            )
+            logger.warning(
+                "3. run 'python tools/print_signatures.py paddle > paddle/fluid/API.spec'."
+            )
+            for test_result in test_results:
+                if test_result.fail:
+                    logger.error(
+                        ">>> In addition, mistakes found in type checking: %s",
+                        test_result.api_name,
+                    )
+                    logger.error(test_result.msg)
+            log_exit(1)
+
+        else:
+            for test_result in test_results:
+                if test_result.fail:
+                    is_fail = True
+
+                    logger.error(test_result.api_name)
+                    logger.error(test_result.msg)
+
+                else:
+                    logger.debug(test_result.api_name)
+                    logger.debug(test_result.msg)
+
+            if is_fail:
+                logger.error(">>> Mistakes found in type checking!")
+                logger.error(">>> Please recheck the type annotations.")
+                log_exit(1)
+
+        logger.warning(">>> Type checking is successful!")
+        logger.warning("----------------End of the Check--------------------")
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parse input arguments
+    """
+    parser = argparse.ArgumentParser(
+        description='run Sample Code Type Checking'
+    )
+    parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument(
+        '--logf', dest='logf', type=str, default=None, help='file for logging'
+    )
+    parser.add_argument(
+        '--config-file',
+        dest='config_file',
+        type=str,
+        default=None,
+        help='config file for type checker',
+    )
+    parser.add_argument(
+        '--cache-dir',
+        dest='cache_dir',
+        type=str,
+        default=None,
+        help='cache dir for mypy',
+    )
+    parser.add_argument('--full-test', dest='full_test', action="store_true")
+
+    args = parser.parse_args()
+    return args
+
+
+def get_test_results(
+    type_checker: TypeChecker, docstrings_to_test: dict[str, str]
+) -> list[TestResult]:
+    _test_style = (
+        type_checker.style
+        if type_checker.style in {'google', 'freeform'}
+        else 'google'
+    )
+    google_style = _test_style == 'google'
+
+    api_names = []
+    codeblocks = []
+    for api_name, raw_docstring in docstrings_to_test.items():
+        # we may extract more than one codeblock from a docstring.
+ for codeblock in extract_code_blocks_from_docstr( + raw_docstring, google_style=google_style + ): + codeblock_name = codeblock['name'] + codeblock_id = codeblock['id'] + + api_names.append(f'{api_name}:{codeblock_name or codeblock_id}') + codeblocks.append(codeblock['codes']) + + test_results = [] + with ProcessPoolExecutor() as exe: + test_results = exe.map( + type_checker.run, api_names, codeblocks, timeout=600 + ) + + return list(test_results) + + +def run_type_checker( + args: argparse.Namespace, type_checker: TypeChecker +) -> None: + # init logger + init_logger(debug=args.debug, log_file=args.logf) + + logger.info( + "----------------Codeblock Type Checking Start--------------------" + ) + + logger.info(">>> Get docstring from api ...") + filter_api = lambda api_name: 'libpaddle' in api_name + docstrings_to_test, whl_error = get_docstring( + full_test=args.full_test, filter_api=filter_api + ) + + logger.info(">>> Running type checker ...") + test_results = get_test_results(type_checker, docstrings_to_test) + + logger.info(">>> Print summary ...") + type_checker.print_summary(test_results, whl_error) + + +if __name__ == '__main__': + base_path = pathlib.Path(__file__).resolve().parent.parent + + args = parse_args() + mypy_checker = MypyChecker( + config_file=( + args.config_file + if args.config_file + else (base_path / 'pyproject.toml') + ), + cache_dir=( + args.cache_dir if args.cache_dir else (base_path / '.mypy_cache') + ), + ) + run_type_checker(args, mypy_checker) diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index 884cea8ca4cd0..016e2a4ff25cb 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -16,7 +16,7 @@ :: Build Paddle compile environment :: =============================== :: Description: -:: +:: :: Install compile environment for xly CI. :: :: Include: @@ -55,7 +55,7 @@ if %errorlevel% == 0 ( ) else ( echo Error***** Download wget tool failed, please download it before rerun. exit /b 1 -) +) goto :eof :: ===== end step 0: wget tool ===== @@ -296,7 +296,7 @@ goto tensorrt echo There is not sccache in this PC, will install sccache. echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe wget -O sccache.exe "https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe" -copy sccache.exe C:\Python38 /Y +copy sccache.exe C:\Python38 /Y goto :eof :: ===== end step 7: sccache on windows ===== diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh index 576f0e5d238ab..25073435e3fb2 100644 --- a/tools/windows/check_change_of_unittest.sh +++ b/tools/windows/check_change_of_unittest.sh @@ -19,7 +19,7 @@ GITHUB_API_TOKEN=$GITHUB_API_TOKEN GIT_PR_ID=$AGILE_PULL_ID BRANCH=$BRANCH if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then - exit 0 + exit 0 fi unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g') diff --git a/tools/xpu/get_xpti_dependence.sh b/tools/xpu/get_xpti_dependence.sh index 95cc4a110ed6d..6801990933d76 100644 --- a/tools/xpu/get_xpti_dependence.sh +++ b/tools/xpu/get_xpti_dependence.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
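Since each MypyChecker.run call launches a full mypy pass, get_test_results above fans the extracted codeblocks out across processes via exe.map(type_checker.run, api_names, codeblocks, timeout=600). A toy model of that fan-out, with an invented stand-in for the checker:

    from concurrent.futures import ProcessPoolExecutor

    def check(api_name: str, codeblock: str) -> str:
        # stand-in for MypyChecker.run, which returns a TestResult instead
        fail = 'import blabla' in codeblock
        return f"{api_name}: {'fail' if fail else 'ok'}"

    if __name__ == '__main__':
        api_names = ['my.mod.demo:code-example-1', 'my.mod.demo2:code-example-1']
        codeblocks = ['x: int = 1\n', 'import blabla\n']
        with ProcessPoolExecutor() as exe:
            results = list(exe.map(check, api_names, codeblocks, timeout=600))
        print(results)  # ['my.mod.demo:code-example-1: ok', 'my.mod.demo2:code-example-1: fail']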