From d65b004a1bab5636d4395f33a19ca11629336255 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Fri, 1 Mar 2024 18:48:04 +0800
Subject: [PATCH 01/15] [PIR] Set NCHW as default Layout for IrTensor (#62254)

* fix

* fix bug

* fix
---
 paddle/fluid/pir/dialect/operator/ir/ir_tensor.h |  2 +-
 paddle/phi/core/kernel_factory.cc                | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h
index e2c3229b04df05..21d8a9fdd7ae54 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h
+++ b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h
@@ -81,7 +81,7 @@ class IrTensor : public phi::TensorBase,
 private:
  phi::DDim dims_;
  phi::DataType dtype_{phi::DataType::FLOAT32};
-  phi::DataLayout layout_{phi::DataLayout::ANY};
+  phi::DataLayout layout_{phi::DataLayout::NCHW};
  LoD lod_;
  size_t offset_{0};
 };
diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index 35ac9e1e0db956..7f1ee799824e8b 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -177,6 +177,22 @@ bool KernelFactory::HasKernel(const std::string& kernel_name,
       phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name));
 
   auto kernel_iter = iter->second.find(kernel_key);
+  if (kernel_iter == iter->second.end() &&
+      kernel_key.layout() != phi::DataLayout::ALL_LAYOUT) {
+    phi::KernelKey any_layout_kernel_key(
+        kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype());
+    kernel_iter = iter->second.find(any_layout_kernel_key);
+  }
+
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+  if (kernel_iter == iter->second.end() &&
+      kernel_key.backend() > phi::Backend::NUM_BACKENDS) {
+    kernel_iter = iter->second.find({phi::Backend::CUSTOM,
+                                     phi::DataLayout::ALL_LAYOUT,
+                                     kernel_key.dtype()});
+  }
+#endif
+
   if (kernel_iter == iter->second.end()) {
     return false;
   }

From 0cb9bf687a3372cf851089fd5508f4d7fafc1295 Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Fri, 1 Mar 2024 19:29:08 +0800
Subject: [PATCH 02/15] [Inference] Add a config api to use PIR (#61968)

* add a config api for pir

* fix comment

* fix the enable failure

* fix bug

* fix bug
---
 paddle/fluid/inference/analysis/argument.h     |  1 +
 .../passes/inference_op_replace_pass.cc        |  4 +---
 .../ir_params_sync_among_devices_pass.cc       |  5 ++---
 paddle/fluid/inference/api/analysis_config.cc  |  1 +
 .../fluid/inference/api/analysis_predictor.cc  | 15 ++++++-------
 .../inference/api/demo_ci/custom_op_demo.cc    |  1 +
 paddle/fluid/inference/api/demo_ci/run.sh      |  2 +-
 paddle/fluid/inference/api/helper.cc           |  6 ++----
 paddle/fluid/inference/api/helper.h            |  2 +-
 .../inference/api/paddle_analysis_config.h     | 14 +++++++++++++
 paddle/fluid/pybind/inference_api.cc           |  2 ++
 .../cpp/inference/analysis/analyzer_tester.cc  |  2 ++
 test/custom_op/test_inference_inplace.py       | 13 +++++-------
 test/ir/inference/auto_scan_test.py            |  4 ++--
 test/ir/inference/program_config.py            |  1 -
 .../inference/test_inference_predictor_run.py  | 13 +++++-------
 .../test_decomp_inference_predictor_run.py     | 21 ++++++++-----------
 17 files changed, 57 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index a87c919bbe2c1f..1407a8f875a297 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -227,6 +227,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(use_cutlass, UseCutlass, bool);
   DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
+  DECL_ARGUMENT_FIELD(use_pir, UsePIR, bool);
 
   // Usually use for trt dynamic shape.
   // TRT will select the best kernel according to opt shape
diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
index b422dea840af5f..993ab2e8618f47 100644
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
@@ -16,14 +16,12 @@
 
 #include "paddle/fluid/inference/analysis/argument.h"
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
 
 void InferenceOpReplacePass::RunImpl(Argument* argument) {
-  if (FLAGS_enable_pir_in_executor) {
+  if (argument->use_pir()) {
     return;
   }
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 2961d5c66f9f49..2e722f9a7e6e9e 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -32,8 +32,6 @@ PD_DEFINE_bool(  // NOLINT
     false,
     "Keep old mode for developers, the model is saved on cpu not device.");
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -208,9 +206,10 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
 #endif
 
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
-  if (FLAGS_enable_pir_in_executor) {
+  if (argument->use_pir()) {
     return;
   }
+
   PADDLE_ENFORCE_EQ(
       argument->scope_valid(),
       true,
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 5987483220b8ad..888e2cbe080c95 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -581,6 +581,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(skip_load_params_);
 
   CP_MEMBER(use_new_executor_);
+  CP_MEMBER(use_pir_);
 
   if (use_gpu_) {
     PADDLE_ENFORCE_EQ(use_xpu_,
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9b05b9f78572e4..1cc723cd7913e8 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -134,7 +134,6 @@
 #include "paddle/fluid/pir/transforms/shape_optimization_pass.h"
 #include "paddle/pir/include/pass/pass_manager.h"
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
 COMMON_DECLARE_bool(pir_apply_inplace_pass);
 
 namespace paddle {
@@ -376,7 +375,7 @@ AnalysisPredictor::AnalysisPredictor(const AnalysisConfig &config)
   }
   if (config_.new_executor_enabled()) {
     config_.EnableMemoryOptim(false);
-    if (FLAGS_enable_pir_in_executor) {
+    if (config_.new_ir_enabled()) {
       config_.SwitchIrOptim(false);
     }
   }
@@ -893,7 +892,7 @@ bool AnalysisPredictor::PrepareExecutor() {
     auto output_names = GetOutputNames();
     execution_config.skip_gc_vars.insert(output_names.begin(),
                                          output_names.end());
-    if (FLAGS_enable_pir_in_executor) {
+    if (config_.new_ir_enabled()) {
       pir_program_ = std::move(
           paddle::TranslateLegacyProgramToProgram(*inference_program_));
 
@@ -1715,6 +1714,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_->SetEnableIrOptim(config_.enable_ir_optim_);
   argument_->SetEnableMemoryOptim(config_.enable_memory_optim());
   argument_->SetModelFromMemory(config_.model_from_memory_);
+  argument_->SetUsePIR(config_.new_ir_enabled());
   // Analyze inference_program
   argument_->SetPredictorID(predictor_id_);
   argument_->SetRootPredictorID(root_predictor_id_);
@@ -1953,14 +1953,14 @@ void AnalysisPredictor::PrepareArgument() {
         model_precision_ == phi::DataType::FLOAT32) {
       argument_->SetEnableIrOptim(true);
       pass_builder->ClearPasses();
-      if (!FLAGS_enable_pir_in_executor) {
+      if (!config_.new_ir_enabled()) {
         pass_builder->AppendPass("map_op_to_another_pass");
         pass_builder->AppendPass("simplify_with_basic_ops_pass");
         pass_builder->AppendPass("is_test_pass");
         pass_builder->AppendPass("constant_folding_pass");
       }
       pass_builder->AppendPass("auto_mixed_precision_pass");
-      if (!FLAGS_enable_pir_in_executor) {
+      if (!config_.new_ir_enabled()) {
         pass_builder->AppendPass("inplace_op_var_pass");
       }
       LOG(INFO) << "This model run in GPU mixed precision mode with no ir "
@@ -2083,8 +2083,9 @@ CreatePaddlePredictor(
   // Register custom operators compiled by the user.
   // This function can only be executed once per process.
   static std::once_flag custom_operators_registered;
-  std::call_once(custom_operators_registered,
-                 []() { inference::RegisterAllCustomOperator(); });
+  std::call_once(custom_operators_registered, [config]() {
+    inference::RegisterAllCustomOperator(config.new_ir_enabled());
+  });
 
   auto SetGflags = [](const AnalysisConfig &config) {
     auto SetGflag = [](const char *name, const char *value) {
diff --git a/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc b/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
index b4c8cccb8e7906..ec44238f008dc4 100644
--- a/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
@@ -52,6 +52,7 @@ int main(int argc, char **argv) {
   config.SetModel(FLAGS_modeldir + "/custom_relu.pdmodel",
                   FLAGS_modeldir + "/custom_relu.pdiparams");
   config.EnableNewExecutor(true);
+  config.EnableNewIR(true);
   auto predictor{paddle_infer::CreatePredictor(config)};
   std::vector<int> input_shape = {1, 1, 28, 28};
   std::vector<float> input_data(1 * 1 * 28 * 28, 1);
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 795b414258b560..3de4fd3d0335ac 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -301,7 +301,7 @@ for WITH_STATIC_LIB in ON OFF; do
           -DCUSTOM_OPERATOR_FILES=$CUSTOM_OPERATOR_FILES \
           -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
     make -j$(nproc)
-    FLAGS_enable_pir_in_executor=1 ./custom_op_demo \
+    ./custom_op_demo \
       --modeldir=$DATA_DIR/custom_op/custom_relu_infer_model
     if [ $? -ne 0 ]; then
       echo "custom_op_demo runs failed " >> ${current_dir}/test_summary.txt
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index e9eb090a771d25..80429055465ebb 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -22,8 +22,6 @@
 #include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/pir/include/core/ir_context.h"
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
-
 namespace paddle {
 namespace inference {
 
@@ -50,11 +48,11 @@ std::string to_string<std::vector<std::vector<float>>>(
   return ss.str();
 }
 
-void RegisterAllCustomOperator() {
+void RegisterAllCustomOperator(bool use_pir) {
   auto &op_meta_info_map = OpMetaInfoMap::Instance();
   const auto &meta_info_map = op_meta_info_map.GetMap();
   for (auto &pair : meta_info_map) {
-    if (FLAGS_enable_pir_in_executor) {
+    if (use_pir) {
       ::pir::IrContext *ctx = ::pir::IrContext::Instance();
       auto *custom_dialect =
           ctx->GetOrRegisterDialect<paddle::dialect::CustomOpDialect>();
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 22a5319bb0dbc4..17ec8852b61df7 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -431,7 +431,7 @@ static bool IsFileExists(const std::string &path) {
   return exists;
 }
 
-void RegisterAllCustomOperator();
+void RegisterAllCustomOperator(bool use_pir);
 
 void InitGflagsFromEnv();
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 134c0799ec663d..64b2de0eba3d4a 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -879,10 +879,22 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   int tensorrt_optimization_level() { return trt_optimization_level_; }
 
+  /// \brief A boolean state telling whether to use new executor.
+  ///
+  /// \return bool whether to use new executor.
+  ///
   void EnableNewExecutor(bool x = true) { use_new_executor_ = x; }
 
   bool new_executor_enabled() const { return use_new_executor_; }
 
+  /// \brief A boolean state telling whether to use new IR.
+  ///
+  /// \return bool whether to use new IR.
+  ///
+  void EnableNewIR(bool x = true) { use_pir_ = x; }
+
+  bool new_ir_enabled() const { return use_pir_; }
+
   ///
   /// \brief Control whether to use optimized model to inference.
   ///
@@ -1425,6 +1437,8 @@ struct PD_INFER_DECL AnalysisConfig {
   // PrepareProgram(). So we add this flag to control the process.
   bool apply_optim_{false};
   bool skip_load_params_{false};
+
+  bool use_pir_{false};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 268806509031e2..708866b0bac348 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -869,6 +869,8 @@ void BindAnalysisConfig(py::module *m) {
       .def("enable_new_executor",
           &AnalysisConfig::EnableNewExecutor,
           py::arg("x") = true)
+      .def("enable_new_ir", &AnalysisConfig::EnableNewIR, py::arg("x") = true)
+      .def("new_ir_enabled", &AnalysisConfig::new_ir_enabled)
      .def("enable_profile", &AnalysisConfig::EnableProfile)
      .def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
      .def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
diff --git a/test/cpp/inference/analysis/analyzer_tester.cc b/test/cpp/inference/analysis/analyzer_tester.cc
index 611fd757c2bcf6..f4a8a0f7669b03 100644
--- a/test/cpp/inference/analysis/analyzer_tester.cc
+++ b/test/cpp/inference/analysis/analyzer_tester.cc
@@ -33,6 +33,7 @@ TEST(Analyzer, analysis_without_tensorrt) {
   argument.SetModelDir(FLAGS_inference_model_dir);
   argument.SetEnableIrOptim(false);
   argument.SetUseGPU(false);
+  argument.SetUsePIR(false);
   argument.SetAnalysisPasses({"ir_graph_build_pass",
                               "ir_analysis_pass",
                               "ir_params_sync_among_devices_pass"});
@@ -49,6 +50,7 @@ TEST(Analyzer, analysis_with_tensorrt) {
   argument.SetTensorRtWorkspaceSize(1 << 20);
   argument.SetModelDir(FLAGS_inference_model_dir);
   argument.SetUseGPU(false);
+  argument.SetUsePIR(false);
   argument.SetAnalysisPasses({"ir_graph_build_pass",
                               "ir_analysis_pass",
                               "ir_params_sync_among_devices_pass"});
diff --git a/test/custom_op/test_inference_inplace.py b/test/custom_op/test_inference_inplace.py
index 303b2b21d15dc8..64219d8e148d00 100644
--- a/test/custom_op/test_inference_inplace.py
+++ b/test/custom_op/test_inference_inplace.py
@@ -83,10 +83,7 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()
 
-    def enable_pir(self, flag: bool):
-        paddle.set_flags({'FLAGS_enable_pir_in_executor': flag})
-
-    def init_predictor(self):
+    def init_predictor(self, use_pir: bool):
         config = Config(
             os.path.join(
                 self.temp_dir.name,
@@ -100,6 +97,8 @@ def init_predictor(self):
         config.enable_use_gpu(256, 0)
         config.switch_ir_optim(False)
         config.enable_new_executor()
+        if use_pir:
+            config.enable_new_ir()
         predictor = create_predictor(config)
         return predictor
 
@@ -123,11 +122,9 @@ def get_outputs(self, predictor):
         return outputs[0]
 
     def test_output(self):
-        self.enable_pir(True)
-        pir_predictor = self.init_predictor()
+        pir_predictor = self.init_predictor(True)
         pir_output = self.get_outputs(pir_predictor)
-        self.enable_pir(False)
-        predictor = self.init_predictor()
+        predictor = self.init_predictor(False)
         output = self.get_outputs(predictor)
         np.testing.assert_allclose(
             output.numpy().flatten(), pir_output.numpy().flatten()
diff --git a/test/ir/inference/auto_scan_test.py b/test/ir/inference/auto_scan_test.py
index b26725314fb1f9..02bd28d7139f97 100755
--- a/test/ir/inference/auto_scan_test.py
+++ b/test/ir/inference/auto_scan_test.py
@@ -352,13 +352,13 @@ def run_test_config(
         """
         Test a single case.
""" - paddle.set_flags({'FLAGS_enable_pir_in_executor': True}) + pred_config.enable_new_ir(True) pred_config.switch_ir_optim(False) pred_config.enable_new_executor() result = super().run_test_config( model, params, prog_config, pred_config, feed_data ) - paddle.set_flags({'FLAGS_enable_pir_in_executor': False}) + pred_config.enable_new_ir(False) return result diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index f3d44361260f94..f64335fc4379e8 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -346,7 +346,6 @@ def _cast(self) -> None: def create_fake_model(program_config): '''Create a Paddle model(in memory) according to the given config.''' - paddle.set_flags({'FLAGS_enable_pir_in_executor': False}) program_config = copy.deepcopy(program_config) program_config._cast() paddle.enable_static() diff --git a/test/ir/inference/test_inference_predictor_run.py b/test/ir/inference/test_inference_predictor_run.py index 1d8abc174f1cf1..21b095d7974426 100644 --- a/test/ir/inference/test_inference_predictor_run.py +++ b/test/ir/inference/test_inference_predictor_run.py @@ -62,10 +62,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() - def enable_pir(self, flag: bool): - paddle.set_flags({'FLAGS_enable_pir_in_executor': flag}) - - def init_predictor(self): + def init_predictor(self, use_pir: bool): config = Config( os.path.join( self.temp_dir.name, @@ -80,6 +77,8 @@ def init_predictor(self): config.switch_ir_optim(False) # config.enable_memory_optim() config.enable_new_executor() + if use_pir: + config.enable_new_ir() predictor = create_predictor(config) return predictor @@ -117,11 +116,9 @@ def get_inorder_output(self, predictor): return outputs[0] def test_output(self): - self.enable_pir(False) - predictor = self.init_predictor() + predictor = self.init_predictor(False) output = self.get_inorder_output(predictor) - self.enable_pir(True) - pir_predictor = self.init_predictor() + pir_predictor = self.init_predictor(True) pir_output = self.get_disorder_output(pir_predictor) np.testing.assert_allclose( diff --git a/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py b/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py index 0a9c091f05ee7f..517cd7083288a9 100644 --- a/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py +++ b/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py @@ -68,10 +68,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() - def enable_pir(self, flag: bool): - paddle.set_flags({'FLAGS_enable_pir_in_executor': flag}) - - def init_predictor(self): + def init_predictor(self, use_pir: bool): config = Config( os.path.join( self.temp_dir.name, @@ -86,6 +83,8 @@ def init_predictor(self): config.enable_use_gpu(256, 0) config.switch_ir_optim(False) config.enable_new_executor() + if use_pir: + config.enable_new_ir() predictor = create_predictor(config) return predictor @@ -118,12 +117,11 @@ def get_inorder_output(self, predictor): return outputs[0] def test_output_prim_inorder(self): - self.enable_pir(False) - predictor = self.init_predictor() + predictor = self.init_predictor(False) output = self.get_inorder_output(predictor) - self.enable_pir(True) + paddle.set_flags({'FLAGS_enable_pir_in_executor': True}) paddle.core._set_prim_all_enabled(True) - pir_predictor = self.init_predictor() + pir_predictor = self.init_predictor(True) pir_output = self.get_inorder_output(pir_predictor) paddle.core._set_prim_all_enabled(False) @@ -135,12 
+133,11 @@ def test_output_prim_inorder(self): ) def test_output_prim_disorder(self): - self.enable_pir(False) - predictor = self.init_predictor() + predictor = self.init_predictor(False) output = self.get_disorder_output(predictor) - self.enable_pir(True) + paddle.set_flags({'FLAGS_enable_pir_in_executor': True}) paddle.core._set_prim_all_enabled(True) - pir_predictor = self.init_predictor() + pir_predictor = self.init_predictor(True) pir_output = self.get_disorder_output(pir_predictor) paddle.core._set_prim_all_enabled(False) From a77172c4dae94550a27d4e620f77b7222556ac31 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 23:12:35 +0800 Subject: [PATCH 03/15] Fix tensor_comsumer tensor_consumer,etc (#62213) --- paddle/fluid/pir/drr/src/attr_type_uilts.h | 6 ++--- .../fluid/pir/drr/src/ir_operation_factory.cc | 24 +++++++++---------- paddle/fluid/pir/drr/src/pattern_graph.cc | 20 ++++++++-------- paddle/fluid/pir/drr/src/rewrite_pattern.cc | 2 +- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/pir/drr/src/attr_type_uilts.h b/paddle/fluid/pir/drr/src/attr_type_uilts.h index 02f5a4defc1551..a48ed382a7d19d 100644 --- a/paddle/fluid/pir/drr/src/attr_type_uilts.h +++ b/paddle/fluid/pir/drr/src/attr_type_uilts.h @@ -48,7 +48,7 @@ PD_SPECIALIZE_CppTypeToIrAttribute(phi::IntArray, paddle::dialect::IntArrayAttribute); template -struct IrAttrbuteCreator { +struct IrAttributeCreator { typename CppTypeToIrAttribute::type operator()(T obj) const { return CppTypeToIrAttribute::type::template get( pir::IrContext::Instance(), obj); @@ -56,7 +56,7 @@ struct IrAttrbuteCreator { }; template <> -struct IrAttrbuteCreator> { +struct IrAttributeCreator> { pir::ArrayAttribute operator()(std::vector obj) const { std::vector attr_vec; attr_vec.reserve(obj.size()); @@ -69,7 +69,7 @@ struct IrAttrbuteCreator> { }; template <> -struct IrAttrbuteCreator> { +struct IrAttributeCreator> { pir::ArrayAttribute operator()(std::vector obj) const { std::vector attr_vec; attr_vec.reserve(obj.size()); diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.cc b/paddle/fluid/pir/drr/src/ir_operation_factory.cc index 61c12c281e1398..bfe97d45592f72 100644 --- a/paddle/fluid/pir/drr/src/ir_operation_factory.cc +++ b/paddle/fluid/pir/drr/src/ir_operation_factory.cc @@ -65,33 +65,33 @@ void OperationFactory::RegisterManualOpCreator() { pir::Attribute CreateIrAttribute(const std::any& obj) { if (obj.type() == typeid(bool)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(int32_t)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(int64_t)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(float)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(std::string)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(const char*)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(phi::DataType)) { - return IrAttrbuteCreator()( + return IrAttributeCreator()( std::any_cast(obj)); } else if (obj.type() == typeid(phi::Place)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return 
IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(std::vector)) { // NOLINT - return IrAttrbuteCreator>()( + return IrAttributeCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(std::vector)) { - return IrAttrbuteCreator>()( + return IrAttributeCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(std::vector)) { - return IrAttrbuteCreator>()( + return IrAttributeCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(phi::IntArray)) { - return IrAttrbuteCreator()( + return IrAttributeCreator()( std::any_cast(obj)); } else { PADDLE_THROW( diff --git a/paddle/fluid/pir/drr/src/pattern_graph.cc b/paddle/fluid/pir/drr/src/pattern_graph.cc index eccbb30dea8906..be57150ed8ffdd 100644 --- a/paddle/fluid/pir/drr/src/pattern_graph.cc +++ b/paddle/fluid/pir/drr/src/pattern_graph.cc @@ -148,7 +148,7 @@ void GraphTopo::WalkGraphNodesTopoOrder( graph_->input_tensors(); const std::unordered_map> &id2owned_tensor = graph_->id2owned_tensor(); - const std::vector> &owend_opcall = + const std::vector> &owned_opcall = graph_->owned_op_call(); std::queue opcall_queue; @@ -156,7 +156,7 @@ void GraphTopo::WalkGraphNodesTopoOrder( opcall_dependent; // init opcall_dependent - for (const std::shared_ptr &opcall_sptr : owend_opcall) { + for (const std::shared_ptr &opcall_sptr : owned_opcall) { if (opcall_sptr.get()->inputs().empty()) { // opcall inputs is empty opcall_queue.push(opcall_sptr.get()); } else { @@ -174,11 +174,11 @@ void GraphTopo::WalkGraphNodesTopoOrder( "The input tensor [%s] must exists " "in pattern graph to be obtained.", tensor_name)); - for (const auto &tensor_comsumer : + for (const auto &tensor_consumer : id2owned_tensor.at(tensor_name).get()->consumers()) { - opcall_dependent[tensor_comsumer].erase(tensor_name); - if (opcall_dependent[tensor_comsumer].empty()) { - opcall_queue.push(tensor_comsumer); + opcall_dependent[tensor_consumer].erase(tensor_name); + if (opcall_dependent[tensor_consumer].empty()) { + opcall_queue.push(tensor_consumer); } } } @@ -190,10 +190,10 @@ void GraphTopo::WalkGraphNodesTopoOrder( // update opcall_dependent for (const auto &output_tensor : opcall->outputs()) { - for (const auto &tensor_comsumer : output_tensor->consumers()) { - opcall_dependent[tensor_comsumer].erase(output_tensor->name()); - if (opcall_dependent[tensor_comsumer].empty()) { - opcall_queue.push(tensor_comsumer); + for (const auto &tensor_consumer : output_tensor->consumers()) { + opcall_dependent[tensor_consumer].erase(output_tensor->name()); + if (opcall_dependent[tensor_consumer].empty()) { + opcall_queue.push(tensor_consumer); } } } diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 04390126ddddf8..46b034aca85582 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -59,7 +59,7 @@ bool DrrRewritePattern::MatchAndRewrite( if (PatternGraphMatch(op, src_match_ctx.get())) { VLOG(4) << "DRR pattern (" << pattern_name_ << ") is matched in program."; PatternGraphRewrite(*src_match_ctx, rewriter); - VLOG(4) << "DRR pattern (" << pattern_name_ << ") is rewrited in program."; + VLOG(4) << "DRR pattern (" << pattern_name_ << ") is rewritten in program."; return true; } return false; From 78254af04977586d0be32f8129236feefb9663c9 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 23:13:54 +0800 Subject: [PATCH 04/15] Fix Unexpceted Unexpected, etc (#62260) --- .../fast_threaded_ssa_graph_executor.cc | 4 ++-- 
 .../framework/details/fetch_op_handle.cc      |  2 +-
 paddle/fluid/framework/operator.cc            | 10 +++++-----
 paddle/fluid/framework/parallel_executor.cc   | 10 +++++-----
 paddle/fluid/framework/tensor_util.cc         |  8 +++++---
 paddle/fluid/framework/trainer_factory.cc     |  4 ++--
 paddle/fluid/operators/cvm_op.cc              |  2 +-
 paddle/fluid/platform/float16_test.cu         |  2 +-
 .../fluid/prim/api/manual_prim/utils/utils.h  |  6 +++---
 paddle/phi/kernels/prior_box_kernel.h         | 20 +++++++++----------
 10 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 19cf30d24db406..66c62085faed2b 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -49,8 +49,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       /*disable_setting_default_stream_for_allocator=*/true,
       /*stream_priority=*/0);
   if (ir::IsTopologySortOperationsUnique(*graph_)) {
-    VLOG(10)
-        << "Change thread number to 1 because the toposort order is unique";
+    VLOG(10) << "Change thread number to 1 because the topology sort order is "
+                "unique";
     strategy_.num_threads_ = 1;
     traced_ops_.clear();
     for (auto *op_node : TopologySortOperations(*graph_)) {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 27be4b77176350..25108148af3494 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -39,7 +39,7 @@ FetchOpHandle::~FetchOpHandle() = default;
 
 void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW(platform::errors::PermissionDenied(
-      "No nodes need to wait FetchOp. Unexpceted Error."));
+      "No nodes need to wait FetchOp. Unexpected Error."));
 }
 
 static void CheckDims(const framework::DDim &tensor_dims,
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 55fc19ad2be1c3..afe442c0a7c6f3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2038,7 +2038,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   phi::KernelContext phi_kernel_context;
   if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
       !need_prepare_data_) {
-    // TODO(inference): Now we only suppor dense_tensor cache, we may be
+    // TODO(inference): Now we only support dense_tensor cache, we may be
     // support ScalarTensor, SparseTensor in future.
     bool all_dense_tensor_input_{true};
     for (auto& iter : Inputs()) {
@@ -2573,7 +2573,7 @@ Scope* OperatorWithKernel::PrepareData(
         // for some situation like InferShape().
         // In this situation We cannot skip Var analysis, as
         // oneDNN shape of Var may differ from kNHWC Var
-        // In such situation corressponding resized Var
+        // In such situation corresponding resized Var
         // has to be created and registered
         if ((tensor_in->layout() == DataLayout::ONEDNN) &&
             (var->IsType<phi::DenseTensor>() == true) &&
@@ -3193,7 +3193,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
 
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto it = ctx.inputs.find(input_names[i]);
-    // calcute the start and end index of the input tensors
+    // calculate the start and end index of the input tensors
     size_t start_idx =
         (i == 0 ? 0 : phi_kernel_context->InputRangeAt(i - 1).second);
 
    // deal with optional here
@@ -3399,7 +3399,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
         attr_iter,
         Attrs().end(),
         platform::errors::NotFound("(%s) is not found in AttributeMap when "
-                                   "buildind static KernelContext.",
+                                   "building static KernelContext.",
                                    attr_names[i]));
     switch (AttrTypeID(attr_iter->second)) {
       case proto::AttrType::INTS: {
@@ -3473,7 +3473,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
                         RuntimeAttrs().end(),
                         platform::errors::NotFound(
                             "(%s) is not found in AttributeMap when "
-                            "buildind static KernelContext.",
+                            "building static KernelContext.",
                             attr_names[i]));
     }
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 897e520813809c..c2b6c37e7dd6e6 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -639,15 +639,15 @@ void InitP2P(const std::vector<platform::Place> &places) {
     for (int i = 0; i < count; ++i) {
       for (int j = 0; j < count; ++j) {
         if (devices[i] == devices[j]) continue;
-        int can_acess = -1;
+        int can_access = -1;
 #ifdef PADDLE_WITH_HIP
         hipError_t ret =
-            hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
-        if (ret != hipSuccess || can_acess != 1) {
+            hipDeviceCanAccessPeer(&can_access, devices[i], devices[j]);
+        if (ret != hipSuccess || can_access != 1) {
 #else
         cudaError_t ret =
-            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
-        if (ret != cudaSuccess || can_acess != 1) {
+            cudaDeviceCanAccessPeer(&can_access, devices[i], devices[j]);
+        if (ret != cudaSuccess || can_access != 1) {
 #endif
           LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                        << " to " << devices[j];
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index fafde716b7bba7..bd869a05880671 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -710,8 +710,9 @@ void TensorFromStream(std::istream& is,
       PADDLE_THROW(platform::errors::Unimplemented(
           "XPUPlace is not supported when not compiled with XPU"));
     } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "CutomPlace is not supported when not compiled with CustomDevice"));
+      PADDLE_THROW(
+          platform::errors::Unimplemented("CustomPlace is not supported when "
+                                          "not compiled with CustomDevice"));
     }
 #endif
   } else {
@@ -887,7 +888,8 @@ std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) {
   auto element_num = tensor.numel();
 
   os << "  - data: [";
-  // Note: int8_t && uint8_t is typedf of char, ostream unable to print properly
+  // Note: int8_t && uint8_t is typedef of char, ostream unable to print
+  // properly
   if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
     if (element_num > 0) {
       os << signed(inspect[0]);
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index ba5dac4830aa18..81b2df6efc723d 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -26,8 +26,8 @@ namespace framework {
 
 class TrainerBase;
 
-typedef std::shared_ptr<TrainerBase> (*CreatetrainerFunction)();
-typedef std::unordered_map<std::string, CreatetrainerFunction> trainerMap;
+typedef std::shared_ptr<TrainerBase> (*CreateTrainerFunction)();
+typedef std::unordered_map<std::string, CreateTrainerFunction> trainerMap;
 trainerMap g_trainer_map;
 
 #define REGISTER_TRAINER_CLASS(trainer_class) \
diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc
index 578a59130495ac..1e414ff217c2f1 100644
--- a/paddle/fluid/operators/cvm_op.cc
+++ b/paddle/fluid/operators/cvm_op.cc
@@ -127,7 +127,7 @@ class CVMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X",
              "(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
             "[N x D],"
-             " where N is the batch size and D is the emebdding dim. ");
+             " where N is the batch size and D is the embedding dim. ");
     AddInput("CVM",
             "(Tensor),  a 2-D Tensor with shape [N x 2], where N is the batch "
             "size, 2 is show and click.");
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 4575b54d48c9bf..555f83d61675ef 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -282,7 +282,7 @@ TEST(float16, compound_on_gpu) {
   TestDivAssign(6, 2, 3);
 }
 
-TEST(float16, comparision_on_gpu) {
+TEST(float16, comparison_on_gpu) {
   TestEqual(1, 1, true);
   TestEqual(1, 2, false);
   TestNotEqual(2, 3, true);
diff --git a/paddle/fluid/prim/api/manual_prim/utils/utils.h b/paddle/fluid/prim/api/manual_prim/utils/utils.h
index 90a25f8bf1e1fd..f3b21169e57f1a 100644
--- a/paddle/fluid/prim/api/manual_prim/utils/utils.h
+++ b/paddle/fluid/prim/api/manual_prim/utils/utils.h
@@ -29,7 +29,7 @@ namespace prim {
 // We put some api like utils here
 template <typename T>
 Tensor empty(const paddle::experimental::IntArray& shape,
-             phi::DataType dype,
+             phi::DataType dtype,
             const paddle::Place& place);
 
 template <typename T>
 Tensor empty_like(const Tensor& x,
                   phi::DataType dtype,
                   const paddle::Place& place);
 
-// copy tensor for output ptr, in static need use assigh op
+// copy tensor for output ptr, in static need use assign op
 template <typename T>
 void by_pass(const Tensor& x, Tensor* out);
 
@@ -114,7 +114,7 @@ static std::vector<int64_t> unsafe_vector_cast(const std::vector<IN_T>& src) {
   return dst;
 }
 
-// This fucction compute unsqueeze dims for reshape to replace unsqueeze.
+// This function compute unsqueeze dims for reshape to replace unsqueeze.
 static std::vector<int64_t> get_unsqueeze_dims(
     const Tensor& origin, const std::vector<int64_t>& axis) {
   auto origin_dims = origin.shape();
diff --git a/paddle/phi/kernels/prior_box_kernel.h b/paddle/phi/kernels/prior_box_kernel.h
index 45a741c7a3a72b..132efb7b6cc722 100644
--- a/paddle/phi/kernels/prior_box_kernel.h
+++ b/paddle/phi/kernels/prior_box_kernel.h
@@ -35,25 +35,25 @@ void PriorBoxKernel(const Context& ctx,
                     DenseTensor* out,
                     DenseTensor* var);
 
-inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratio,
                                bool flip,
-                               std::vector<float>* output_aspect_ratior) {
+                               std::vector<float>* output_aspect_ratio) {
   constexpr float epsilon = 1e-6;
-  output_aspect_ratior->clear();
-  output_aspect_ratior->push_back(1.0f);
-  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
-    float ar = input_aspect_ratior[i];
+  output_aspect_ratio->clear();
+  output_aspect_ratio->push_back(1.0f);
+  for (size_t i = 0; i < input_aspect_ratio.size(); ++i) {
+    float ar = input_aspect_ratio[i];
     bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
-      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
+    for (size_t j = 0; j < output_aspect_ratio->size(); ++j) {
+      if (fabs(ar - output_aspect_ratio->at(j)) < epsilon) {
         already_exist = true;
         break;
       }
     }
     if (!already_exist) {
-      output_aspect_ratior->push_back(ar);
+      output_aspect_ratio->push_back(ar);
       if (flip) {
-        output_aspect_ratior->push_back(1.0f / ar);
+        output_aspect_ratio->push_back(1.0f / ar);
       }
     }
   }

From 317fad13a6d7cfcebd69405ad8a9c5561b117daf Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 1 Mar 2024 23:15:22 +0800
Subject: [PATCH 05/15] Fix maxinum maximum, etc (#62290)

---
 paddle/phi/kernels/bmm_kernel.h                |  2 +-
 .../kernels/xpu/instance_norm_grad_kernel.cc   |  2 +-
 paddle/phi/kernels/xpu/inverse_kernel.cc       |  2 +-
 .../phi/kernels/xpu/multiclass_nms3_kernel.cc  |  2 +-
 paddle/phi/kernels/xpu/prelu_grad_kernel.cc    |  4 +--
 .../phi/kernels/xpu/reduce_max_grad_kernel.cc  | 30 +++++++++----------
 .../phi/kernels/xpu/reduce_min_grad_kernel.cc  | 30 +++++++++----------
 paddle/phi/kernels/xpu/rnn_util.h              |  2 +-
 .../phi/kernels/xpu/set_value_grad_kernel.cc   |  2 +-
 paddle/phi/kernels/xpu/set_value_kernel.cc     |  2 +-
 10 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/paddle/phi/kernels/bmm_kernel.h b/paddle/phi/kernels/bmm_kernel.h
index 09e7f9647b68eb..6d3733bf750d3f 100644
--- a/paddle/phi/kernels/bmm_kernel.h
+++ b/paddle/phi/kernels/bmm_kernel.h
@@ -22,7 +22,7 @@ namespace phi {
  * @brief Bmm Kernel.
  *        Applies batched matrix multiplication to two tensors.
  *
- *        Both of the two input tensors must be three-dementional
+ *        Both of the two input tensors must be three-dimensional
  *        and share the same batch size.
  *        if x is a (b, m, k) tensor, y is a (b, k, n) tensor,
 *         the output will be a (b, m, n) tensor.
diff --git a/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc index dba0e2ccfd7651..f1a217ed81ad35 100644 --- a/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc @@ -39,7 +39,7 @@ void InstanceNormGradKernel(const Context& dev_ctx, true, phi::errors::InvalidArgument( "The size of input's dimensions should be less equal than 5", - "and the dimension of D should be eaual to 1", + "and the dimension of D should be equal to 1", "But received: the size of input's dimensions is [%d]", x_dims.size())); diff --git a/paddle/phi/kernels/xpu/inverse_kernel.cc b/paddle/phi/kernels/xpu/inverse_kernel.cc index a48baa508ade00..966fcc97e0ab09 100644 --- a/paddle/phi/kernels/xpu/inverse_kernel.cc +++ b/paddle/phi/kernels/xpu/inverse_kernel.cc @@ -41,7 +41,7 @@ void InverseKernel(const Context& dev_ctx, 8192, phi::errors::InvalidArgument( "The size of a single matrix (%d bytes) exceeds the " - "maxinum numbers of bytes xpu supports (8192).", + "maximum numbers of bytes xpu supports (8192).", n * n * sizeof(T))); auto RAII_GUARD = xpu::ctx_guard(dev_ctx.x_context()); auto* info_xpu = RAII_GUARD.alloc_l3_or_gm(batch); diff --git a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc index 17746e4eeff0af..2f343ccc6b494e 100644 --- a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc +++ b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc @@ -90,7 +90,7 @@ void MultiClassNMSKernel(const Context& ctx, PADDLE_ENFORCE_EQ( boxes_count == score_dims[0], true, - phi::errors::InvalidArgument("boxes_count shuold equal score_dims[0].", + phi::errors::InvalidArgument("boxes_count should equal score_dims[0].", "But received: (%d) and (%d)", boxes_count, score_dims[0])); diff --git a/paddle/phi/kernels/xpu/prelu_grad_kernel.cc b/paddle/phi/kernels/xpu/prelu_grad_kernel.cc index fa43c908837664..b7c2157d55f43e 100644 --- a/paddle/phi/kernels/xpu/prelu_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/prelu_grad_kernel.cc @@ -60,9 +60,9 @@ void PReluGradKernel(const Context& dev_ctx, } } - // mode = 0: channel_nchw, slope_shape = {c}, default. meanwhile, xhsape = {n, + // mode = 0: channel_nchw, slope_shape = {c}, default. 
meanwhile, xshape = {n, // c, h, w} - // mode = 1, channel_nhwc, slope_shape = {c}, meanwhile, xhsape = {n, h, w, c} + // mode = 1, channel_nhwc, slope_shape = {c}, meanwhile, xshape = {n, h, w, c} // mode = 2, elementwise, slope_shape = {c*h*w} // mode = 3, single slope, slope_shape = {1} diff --git a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc index 846250c0677406..aa8736d84b71f0 100644 --- a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc @@ -60,23 +60,23 @@ void ReduceMaxGradKernel(const Context& dev_ctx, } } - T* brocast1 = nullptr; - T* brocast2 = nullptr; + T* broadcast1 = nullptr; + T* broadcast2 = nullptr; bool* equal = nullptr; xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - brocast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast1, errors::ResourceExhausted("XPU has no enough memory")); + broadcast1, errors::ResourceExhausted("XPU has no enough memory")); equal = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( equal, errors::ResourceExhausted("XPU has no enough memory")); - brocast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast2, errors::ResourceExhausted("XPU has no enough memory")); + broadcast2, errors::ResourceExhausted("XPU has no enough memory")); // use [1] to replace [], because xpu not support [] if (xdims.size() == 0) { @@ -86,25 +86,25 @@ void ReduceMaxGradKernel(const Context& dev_ctx, ydims = std::vector({1}); } - // step 1. brocast out and out_grad - int r = - xpu::broadcast(dev_ctx.x_context(), out_data, brocast1, ydims, xdims); + // step 1. broadcast out and out_grad + int r = xpu::broadcast( + dev_ctx.x_context(), out_data, broadcast1, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); r = xpu::broadcast( - dev_ctx.x_context(), out_grad_data, brocast2, ydims, xdims); + dev_ctx.x_context(), out_grad_data, broadcast2, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); - // step 2. comparse out_brocast and x - r = xpu::equal(dev_ctx.x_context(), x_data, brocast1, equal, x.numel()); + // step 2. compare out_broadcast and x + r = xpu::equal(dev_ctx.x_context(), x_data, broadcast1, equal, x.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "equal"); // step 3. 
get x_grad - r = xpu::constant(dev_ctx.x_context(), brocast1, x.numel(), 0); + r = xpu::constant(dev_ctx.x_context(), broadcast1, x.numel(), 0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); r = xpu::select(dev_ctx.x_context(), equal, - brocast2, - brocast1, + broadcast2, + broadcast1, x_grad_data, xdims, xdims); diff --git a/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc index 9019cb0834d72e..aefcc74b450919 100644 --- a/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc @@ -60,23 +60,23 @@ void ReduceMinGradKernel(const Context& dev_ctx, } } - T* brocast1 = nullptr; - T* brocast2 = nullptr; + T* broadcast1 = nullptr; + T* broadcast2 = nullptr; bool* equal = nullptr; xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - brocast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast1, errors::ResourceExhausted("XPU has no enough memory")); + broadcast1, errors::ResourceExhausted("XPU has no enough memory")); equal = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( equal, errors::ResourceExhausted("XPU has no enough memory")); - brocast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast2, errors::ResourceExhausted("XPU has no enough memory")); + broadcast2, errors::ResourceExhausted("XPU has no enough memory")); // use [1] to replace [], because xpu not support [] if (xdims.size() == 0) { @@ -86,25 +86,25 @@ void ReduceMinGradKernel(const Context& dev_ctx, ydims = std::vector({1}); } - // step 1. brocast out and out_grad - int r = - xpu::broadcast(dev_ctx.x_context(), out_data, brocast1, ydims, xdims); + // step 1. broadcast out and out_grad + int r = xpu::broadcast( + dev_ctx.x_context(), out_data, broadcast1, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); r = xpu::broadcast( - dev_ctx.x_context(), out_grad_data, brocast2, ydims, xdims); + dev_ctx.x_context(), out_grad_data, broadcast2, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); - // step 2. comparse out_brocast and x - r = xpu::equal(dev_ctx.x_context(), x_data, brocast1, equal, x.numel()); + // step 2. compare out_broadcast and x + r = xpu::equal(dev_ctx.x_context(), x_data, broadcast1, equal, x.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "equal"); // step 3. get x_grad - r = xpu::constant(dev_ctx.x_context(), brocast1, x.numel(), 0); + r = xpu::constant(dev_ctx.x_context(), broadcast1, x.numel(), 0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); r = xpu::select(dev_ctx.x_context(), equal, - brocast2, - brocast1, + broadcast2, + broadcast1, x_grad_data, xdims, xdims); diff --git a/paddle/phi/kernels/xpu/rnn_util.h b/paddle/phi/kernels/xpu/rnn_util.h index 5310b35e64dc36..7948bb2defa0ca 100644 --- a/paddle/phi/kernels/xpu/rnn_util.h +++ b/paddle/phi/kernels/xpu/rnn_util.h @@ -23,7 +23,7 @@ void ResetParameterVector(const std::vector& raw_params_vec, const int& num_layers, const bool& is_bidirec, std::vector>* params_vec) { - // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers + // the parameter raw sequence is [FWhi, FWhh, BWhi, BWhh] * num_layers // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers const int& direction_num = is_bidirec ? 
2 : 1; diff --git a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc index c5d33ae4ac8d06..227d6b39c9f281 100644 --- a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc @@ -203,7 +203,7 @@ void SetValueGradImpl(const Context& dev_ctx, auto value_grad_dims = value_grad->dims(); auto fake_value_grad_dims = out_dims; - // Create an extented shape according to the rules of broadcast. + // Create an extended shape according to the rules of broadcast. auto value_grad_dims_size = value_grad_dims.size(); int num_decrease = 0; diff --git a/paddle/phi/kernels/xpu/set_value_kernel.cc b/paddle/phi/kernels/xpu/set_value_kernel.cc index c457a6d21fd8a1..60b0fff7d9d7c8 100644 --- a/paddle/phi/kernels/xpu/set_value_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_kernel.cc @@ -263,7 +263,7 @@ void SetValueKernelImpl(const Context& dev_ctx, const std::vector& decrease_axes, const std::vector& none_axes, DenseTensor* out) { - // rank是xtensor的维度信息 + // rank是x tensor的维度信息 const int rank = x.dims().size(); switch (rank) { From 13d74009555434d6327a00a01aee68fc111c14bb Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 23:17:04 +0800 Subject: [PATCH 06/15] Update kernel_backward.h (#62288) --- .../fusion/cutlass/memory_efficient_attention/kernel_backward.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h index 31ce0bd3574ee9..2bd3ac2db5f5b7 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h @@ -492,8 +492,6 @@ struct AttentionBackwardKernel { scalar_t, // ElementC accum_t // ElementAccumulator >; - static constexpr auto kOptimalAlignement = - std::max(DefaultConfig::kAlignmentA, DefaultConfig::kAlignmentB); static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment; struct MatmulQK { From 06d3a5de0321e2d23787a1a6ea1e4572e294585b Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Sat, 2 Mar 2024 04:32:36 +0800 Subject: [PATCH 07/15] Fix copy *.h on paddle/pir dir introduced from PR#61863 (#62293) --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index 9fd352ddd26be0..3ba1dc05e4976d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -876,7 +876,7 @@ headers = ( # init headers list(find_files('init_phi.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) + # phi init headers # init headers - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pir/include')) + # pir init headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pir/include', recursive=True)) + # pir init headers # init headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/pir/drr/include')) + # drr init headers # init headers From cbe8810bbea29c28cc99ccd764134dd30fb61e84 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Sat, 2 Mar 2024 08:19:07 +0800 Subject: [PATCH 08/15] [PIR][DynamicShape] Fix bug in slice op's InferSymbolicShape (#62247) * Fix bug in slice op's InferSymbolicShape * add more tests * fix ci --- .../infer_symbolic_shape/infer_sym_utils.cc | 11 + .../infer_symbolic_shape/infer_sym_utils.h | 8 + .../paddle_op_infer_sym.cc | 241 +++++++++++------- .../shape_dialect/shape_optimization_test.cc | 8 
+- .../cinn/symbolic/test_op_infer_sym_shape.py | 58 +++++ 5 files changed, 231 insertions(+), 95 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc index 4e5f5df08732a0..5675429b5c65f2 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc @@ -16,6 +16,17 @@ namespace paddle::dialect::details { +std::optional> VecExpr2Int64(const ExprVec &expr_vec) { + std::vector int64vec; + for (auto item : expr_vec) { + if (!item.isa()) { + return std::nullopt; + } + int64vec.push_back(item.Get()); + } + return int64vec; +} + bool ReduceInferDim(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis, const std::vector &axis, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index 8a14e40e6337af..d2d508ff5890db 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -17,6 +17,12 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" +// To make codes shorter +using ExprVec = std::vector; +using ShapeOrData = symbol::ShapeOrDataDimExprs; +using TensorExprs = symbol::TensorShapeOrDataDimExprs; +using TensorListExprs = symbol::TensorListShapeOrDataDimExprs; + namespace paddle::dialect::details { template struct AttributeTrait; @@ -60,6 +66,8 @@ std::vector GetVectorAttr(const ::pir::Operation *op, return vec_res; } +std::optional> VecExpr2Int64(const ExprVec &expr_vec); + bool ReduceInferDim(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis, const std::vector &axis, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index d95f1095635184..1be26c82f4c21a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -19,11 +19,6 @@ namespace paddle::dialect { -// To make codes shorter -using ShapeOrData = symbol::ShapeOrDataDimExprs; -using TensorExprs = symbol::TensorShapeOrDataDimExprs; -using TensorListExprs = symbol::TensorListShapeOrDataDimExprs; - bool DataOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { const auto &attributes = op->attributes(); @@ -270,9 +265,104 @@ bool FullIntArrayOpInferSymbolicShape( return true; } +inline void CheckAndUpdateSliceAttrs( + const ExprVec &in_dims, + const std::vector &axes, + ExprVec *starts_p, + ExprVec *ends_p, + std::vector *infer_flags = nullptr) { + auto vec_int64 = details::VecExpr2Int64(*starts_p); + IR_ENFORCE(vec_int64.has_value(), + "for slice op, all the elements in `starts` must be int64_t"); + std::vector starts_int = vec_int64.value(); + + vec_int64 = details::VecExpr2Int64(*ends_p); + IR_ENFORCE(vec_int64.has_value(), + "for slice op, all the elements in `ends` must be int64_t"); + std::vector ends_int = vec_int64.value(); + + ExprVec &starts = *starts_p; + ExprVec &ends = *ends_p; + auto IsMaxInt = [](const 
symbol::DimExpr &expr) {
+    return expr.isa<int64_t>() &&
+           expr.Get<int64_t>() ==
+               static_cast<int64_t>(std::numeric_limits<int64_t>::max());
+  };
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    int64_t axis = axes[i];
+
+    if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("SliceOpInferSymbolicShape CAN NOT "
+                                     "deal with -1 in infer_flags now"));
+    }
+
+    // Since both start and end can be negative or positive, we need to
+    // handle the following different arrangements.
+    ends[i] = IsMaxInt(ends[i]) ? in_dims[axis] : ends[i];
+
+    bool both_negative_or_positive =
+        (starts_int[i] >= 0 && ends_int[i] >= 0) ||
+        (starts_int[i] <= 0 && ends_int[i] <= 0);
+    bool start_negative_end_positive = starts_int[i] <= 0 && ends_int[i] >= 0;
+    bool start_positive_end_negative = starts_int[i] >= 0 && ends_int[i] <= 0;
+
+    if (both_negative_or_positive) {
+      continue;
+    } else if (start_negative_end_positive) {
+      starts[i] = starts[i] + in_dims[axis];
+    } else if (start_positive_end_negative) {
+      starts[i] = starts[i] - in_dims[axis];
+    } else {
+      LOG(FATAL) << "Dead code";
+    }
+  }
+}
+// e.g. for in_dims = [S0], axes = [0], starts = [-3], ends = [3], the start
+// is normalized to S0 - 3 here, and GetSliceDims below yields 3 - (S0 - 3).
+
+inline ExprVec GetSliceDims(const ExprVec &in_dims,
+                            const std::vector<int64_t> &axes,
+                            const ExprVec &starts,
+                            const ExprVec &ends,
+                            std::vector<int64_t> *infer_flags = nullptr) {
+  ExprVec slice_dims(in_dims);
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    int64_t axis = axes[i];
+
+    if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("SliceOpInferSymbolicShape CAN NOT "
+                                     "deal with -1 in infer_flags now"));
+    }
+
+    slice_dims[axis] = ends[i] - starts[i];
+  }
+
+  return slice_dims;
+}
+
+inline ExprVec GetDecreasedDims(const ExprVec &slice_dims,
+                                const std::vector<int64_t> &decrease_axes) {
+  ExprVec decreased_dims(slice_dims);
+  std::vector<uint8_t> decrease_flag(slice_dims.size(), 0);
+  if (decrease_axes.size() > 0) {
+    for (size_t i = 0; i < decrease_axes.size(); ++i) {
+      int64_t axis = decrease_axes[i];
+      decrease_flag[axis] = 1;
+    }
+    ExprVec new_shape;
+    for (size_t i = 0; i < slice_dims.size(); ++i) {
+      if (decrease_flag[i] == 0) {
+        new_shape.emplace_back(slice_dims[i]);
+      }
+    }
+    decreased_dims = new_shape;
+  }
+  return decreased_dims;
+}
+
 bool SliceOpInferSymbolicShape(pir::Operation *op,
                                pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  // TODO(zhangbopd): Not implemented yet.
   pir::Value operand_source = op->operand_source(0);
   pir::Value operand_starts = op->operand_source(1);
   pir::Value operand_ends = op->operand_source(2);
@@ -285,107 +375,76 @@ bool SliceOpInferSymbolicShape(pir::Operation *op,
   const symbol::ShapeOrDataDimExprs &ends_shape_data =
       shape_analysis->GetShapeOrDataForValue(operand_ends);
 
-  // Currently, we DO NOT support the case that any element in `axes` `starts`
-  // or `ends` is a Symbol.
   const std::vector<int64_t> axes = [&] {
-    const auto &attributes = op->attributes();
-    pir::Attribute attr_axes = attributes.at("axes");
-
-    const auto &axes_vec = attr_axes.dyn_cast<pir::ArrayAttribute>().AsVector();
-    std::vector<int64_t> axes;
+    std::vector<int64_t> axes_vec = details::GetVectorAttr<int64_t>(op, "axes");
     int64_t rank = int64_t(operand_shape_or_data.shape().size());
-    for (auto item : axes_vec) {
-      int64_t axis = item.dyn_cast<pir::Int64Attribute>().data();
-      axes.emplace_back(axis >= 0 ? axis : std::max(int64_t(0), axis + rank));
+    for (size_t i = 0; i < axes_vec.size(); i++) {
+      int64_t axis = axes_vec[i];
+      axes_vec[i] = axis >= 0 ? axis : std::max(int64_t(0), axis + rank);
     }
-    return axes;
+    return axes_vec;
   }();
 
-  const std::vector<int64_t> starts = [&] {
-    std::vector<int64_t> starts;
-    for (auto item : starts_shape_data.data().value()) {
-      IR_ENFORCE(item.isa<int64_t>(),
-                 "Currently, we DO NOT support the case that any element in "
-                 "`starts` is a Symbol.");
-      starts.push_back(item.Get<int64_t>());
-    }
-    return starts;
-  }();
+  // Currently, we DO NOT support the case where any element in `starts` is a
+  // Symbol.
+  ExprVec starts = starts_shape_data.data().value();
+  ExprVec ends = ends_shape_data.data().value();
 
-  const std::vector<int64_t> ends = [&] {
-    std::vector<int64_t> ends;
-    for (auto item : ends_shape_data.data().value()) {
-      IR_ENFORCE(item.isa<int64_t>(),
-                 "Currently, we DO NOT support the case that any element in "
-                 "`ends` is a Symbol.");
-      ends.push_back(item.Get<int64_t>());
+  std::vector<int64_t> infer_flags = [op, &axes] {
+    std::vector<int64_t> infer_flags_t =
+        details::GetVectorAttr<int64_t>(op, "infer_flags");
+    if (infer_flags_t.empty()) {
+      infer_flags_t = std::vector<int64_t>(axes.size(), 1);
     }
-    return ends;
+    return infer_flags_t;
   }();
 
-  // When `pd.slice` is operating on a tensor which is produced by a `pd.shape`
-  // op, the reseult should be written into data.
-  const auto &GetDataDimExprs = [&]() -> symbol::ShapeOrDataDimExprs {
-    const std::vector<symbol::DimExpr> out_data = [&] {
-      std::vector<symbol::DimExpr> out_data;
-      const int64_t start =
-          starts[0] < 0
-              ? starts[0] + operand_shape_or_data.data().value().size()
-              : starts[0];
-      const int64_t end =
-          static_cast<int64_t>(std::numeric_limits<int64_t>::max()) == ends[0]
-              ? operand_shape_or_data.data().value().size()
-              : ends[0];
-
-      for (int64_t i = start; i < end; i++) {
-        out_data.push_back(operand_shape_or_data.data().value()[i]);
-      }
-      return out_data;
-    }();
-    const std::vector<symbol::DimExpr> shape{std::int64_t(out_data.size())};
-    return symbol::ShapeOrDataDimExprs{
-        symbol::TensorShapeOrDataDimExprs(shape, out_data)};
-  };
+  const std::vector<int64_t> decrease_axis =
+      details::GetVectorAttr<int64_t>(op, "decrease_axis");
 
-  // Othewise, the reseult should be written into the shape.
   const auto &GetShapeDimExprs = [&]() -> symbol::ShapeOrDataDimExprs {
-    std::vector<symbol::DimExpr> out_shape = operand_shape_or_data.shape();
+    const ExprVec &in_dims = operand_shape_or_data.shape();
+    CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &infer_flags);
+    ExprVec slice_dims =
+        GetSliceDims(in_dims, axes, starts, ends, &infer_flags);
+    ExprVec out_dims = GetDecreasedDims(slice_dims, decrease_axis);
 
-    const std::vector<symbol::DimExpr> &dim_expr_starts =
-        starts_shape_data.data().value();
-    const std::vector<symbol::DimExpr> &dim_expr_ends =
-        ends_shape_data.data().value();
+    return symbol::ShapeOrDataDimExprs{
+        symbol::TensorShapeOrDataDimExprs(out_dims)};
+  };
 
-    // For both start and end can be negtive or positive, we need to handle the
-    // following different arrangements.
-    auto IsMaxInt = [](const symbol::DimExpr &expr) {
-      return expr.isa<int64_t>() &&
-             expr.Get<int64_t>() ==
-                 static_cast<int64_t>(std::numeric_limits<int64_t>::max());
-    };
-    for (size_t i = 0; i < axes.size(); ++i) {
-      const int64_t axis = axes[i];
-      auto end =
-          IsMaxInt(dim_expr_ends[i]) ? out_shape[axis] : dim_expr_ends[i];
-
-      bool both_negative_or_positive =
-          (starts[i] >= 0 && ends[i] >= 0) || (starts[i] <= 0 && ends[i] <= 0);
-      bool start_negative_end_positive = starts[i] <= 0 && ends[i] >= 0;
-      bool start_positive_end_negative = starts[i] >= 0 && ends[i] <= 0;
-
-      if (both_negative_or_positive) {
-        out_shape[axis] = end - dim_expr_starts[i];
-      } else if (start_negative_end_positive) {
-        out_shape[axis] = end - dim_expr_starts[i] - out_shape[axis];
-      } else if (start_positive_end_negative) {
-        out_shape[axis] = out_shape[axis] - dim_expr_starts[i] + end;
-      } else {
-        LOG(FATAL) << "Dead code";
-      }
+  // When `pd.slice` is operating on a tensor which is produced by a `pd.shape`
+  // op, the result should be written into data.
+  const auto &GetDataDimExprs = [&]() -> symbol::ShapeOrDataDimExprs {
+    std::vector<symbol::DimExpr> out_data;
+
+    // Currently, we DO NOT support the case that any element in `axes`,
+    // `starts`, or `ends` is a Symbol.
+    auto vec_int64 = details::VecExpr2Int64(starts);
+    IR_ENFORCE(vec_int64.has_value(),
+               "for slice op, all the elements in `starts` must be int64_t");
+    std::vector<int64_t> starts_int = vec_int64.value();
+
+    vec_int64 = details::VecExpr2Int64(ends);
+    IR_ENFORCE(vec_int64.has_value(),
+               "for slice op, all the elements in `ends` must be int64_t");
+    std::vector<int64_t> ends_int = vec_int64.value();
+
+    const int64_t start =
+        starts_int[0] < 0
+            ? starts_int[0] + operand_shape_or_data.data().value().size()
+            : starts_int[0];
+    const int64_t end =
+        static_cast<int64_t>(std::numeric_limits<int64_t>::max()) == ends_int[0]
+            ? operand_shape_or_data.data().value().size()
+            : ends_int[0];
+
+    for (int64_t i = start; i < end; i++) {
+      out_data.push_back(operand_shape_or_data.data().value()[i]);
     }
 
+    const std::vector<symbol::DimExpr> shape{std::int64_t(out_data.size())};
     return symbol::ShapeOrDataDimExprs{
-        symbol::TensorShapeOrDataDimExprs(out_shape)};
+        symbol::TensorShapeOrDataDimExprs(shape, out_data)};
   };
 
   symbol::ShapeOrDataDimExprs shape_data =
diff --git a/test/cpp/pir/shape_dialect/shape_optimization_test.cc b/test/cpp/pir/shape_dialect/shape_optimization_test.cc
index b48f84db4d1b80..faefec6e7ec416 100644
--- a/test/cpp/pir/shape_dialect/shape_optimization_test.cc
+++ b/test/cpp/pir/shape_dialect/shape_optimization_test.cc
@@ -122,10 +122,10 @@ TEST(shape_optimization, shape_optimization_pass) {
             "Mul(Mul(Mul(Mul(1, S1), 128), 32), 1 / (128))");
   EXPECT_EQ(cast_res.shape()[3], 2);
 
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[0]), "Add(Add(S2, -2), -2)");
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[1]), "Add(Add(S3, -2), -2)");
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(Add(S4, -2), -2)");
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[3]), "Add(Add(S5, -2), -2)");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[0]), "Add(-2, -Add(2, -S2))");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[1]), "Add(-2, -Add(2, -S3))");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(-2, -Add(2, -S4))");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[3]), "Add(-2, -Add(2, -S5))");
 
   EXPECT_EQ(subtract_res.shape()[0], 1);
   EXPECT_EQ(subtract_res.shape()[1], 64);
diff --git a/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py
index 61ca48f19d797c..4ab27bf657eac9 100644
--- a/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py
+++ b/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py
@@ -459,5 +459,63 @@ def test_eval_symbolic(self):
         return True
 
 
+class SliceNet(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, 
x): + out = x[:, -1, :] + out = x[1:3, 0:2, 2:4] + + axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + out = paddle.slice(x, axes=axes, starts=starts, ends=ends) + + return out + + +class TestSliceOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + + self.expected = [ + [ + 'shape[S0, S2], data[NULL]', + 'shape[2, 2, 2], data[NULL]', + 'shape[Add(3, -Add(-3, S0)), 2, 2]', + ] + ] + + def test_eval_symbolic(self): + net = SliceNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.slice' + ) + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + if __name__ == '__main__': unittest.main() From f445bd8d31a8dc283d63dc282dc09082bf77a059 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Sat, 2 Mar 2024 08:48:30 +0800 Subject: [PATCH 09/15] [DRR]Fix SegmentFault for BlockArgument while applying pass in Llama2 infer (#62283) * [DRR]Fix SegmentFault for BlockArgument while applying pass in Llama2 infer * fix typo --- paddle/fluid/pir/drr/src/rewrite_pattern.cc | 137 ++++++++++++-------- 1 file changed, 85 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 46b034aca85582..e19d5ae224c7d3 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -258,95 +258,128 @@ bool DrrRewritePattern::MatchFromOutputToInput( std::unordered_set ir_visited; std::queue drr_q; std::queue ir_q; - bool matched = true; - size_t step = 0; - for (auto it = output_op_map.begin(); it != output_op_map.end(); ++it) { - VLOG(6) << "match (" << it->first->name() << " @" << it->first << " : @" - << it->second << ") in source_pattern_graph "; - drr_q.push(it->first); - drr_visited.insert(it->first); - ir_q.push(it->second); - ir_visited.insert(it->second); - } - while (!drr_q.empty()) { - if (!matched) break; - auto* drr_node = drr_q.front(); - auto* ir_node = ir_q.front(); - drr_q.pop(); - ir_q.pop(); + // Initialize DRR matched queue. + const auto& InitDrrQueue = [&]() -> void { + for (auto it = output_op_map.begin(); it != output_op_map.end(); ++it) { + VLOG(6) << "match (" << it->first->name() << " @" << it->first << " : @" + << it->second << ") in source_pattern_graph "; + drr_q.push(it->first); + drr_visited.insert(it->first); + ir_q.push(it->second); + ir_visited.insert(it->second); + } + }; + // Check whether DrrNode and Operation have the same Operands and Results + // information. 
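+  // (These cheap name/arity checks let the matcher fail fast before the
+  // more expensive operand-by-operand walk further below.)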
+  const auto& IsSameOperandsAndResults =
+      [](const OpCall* drr_node, const pir::Operation* ir_node) -> bool {
     if (drr_node->name() != ir_node->name()) {
-      matched = false;
       VLOG(8) << "Match failed: drr_node(" << drr_node->name()
               << ") != pir_node(" << ir_node->name() << ").";
-      break;
+      return false;
     }
     const auto& drr_input_tensors = drr_node->inputs();
     auto ir_input_value_size = ir_node->num_operands();
     if (drr_input_tensors.size() != ir_input_value_size) {
-      matched = false;
       VLOG(8) << drr_node->name() << " Match failed: drr input tensors("
               << drr_input_tensors.size() << ") != pir input tensors("
               << ir_input_value_size << ").";
-      break;
+      return false;
     }
     if (drr_node->outputs().size() != ir_node->num_results()) {
-      matched = false;
       VLOG(8) << drr_node->name() << " Match failed: drr output tensors("
               << drr_node->outputs().size() << ") != pir output tensors("
               << ir_node->num_results() << ").";
+      return false;
+    }
+    return true;
+  };
+  // Check whether source_pattern_match_ctx has already visited the
+  // Operation's operands.
+  const auto& HasVisitedOperands = [&](const Tensor* drr_input_tensor,
+                                       pir::Value ir_value) -> bool {
+    const auto& tensor_name = drr_input_tensor->name();
+    if (ir_value.isa<pir::BlockArgument>()) {
+      VLOG(8) << "Match Attention! Found BlockArgument as input of "
+              << tensor_name;
+    }
+    return source_pattern_match_ctx->tensor_map().count(tensor_name) != 0 &&
+           ir_value != source_pattern_match_ctx->tensor_map().at(tensor_name);
+  };
+  // Update drr_q and the visited sets. Return false if the update fails.
+  const auto& TryUpdateDrrQueue = [&](const OpCall* drr_producer_op,
+                                      pir::Operation* ir_producer_op) -> bool {
+    // Still return true if both have already been visited.
+    if (drr_visited.count(drr_producer_op) &&
+        ir_visited.count(ir_producer_op)) {
+      return true;
+    }
+    // Push and mark both if neither has been visited yet.
+    if (!drr_visited.count(drr_producer_op) &&
+        !ir_visited.count(ir_producer_op)) {
+      drr_q.push(drr_producer_op);
+      ir_q.push(ir_producer_op);
+      drr_visited.insert(drr_producer_op);
+      ir_visited.insert(ir_producer_op);
+      return true;
+    }
+    return false;
+  };
+
+  // Initialize the DRR matched queue.
+  bool matched = true;
+  size_t step = 0;
+  InitDrrQueue();
+
+  while (!drr_q.empty()) {
+    if (!matched) break;
+    auto* drr_node = drr_q.front();
+    auto* ir_node = ir_q.front();
+    drr_q.pop();
+    ir_q.pop();
+    if (!IsSameOperandsAndResults(drr_node, ir_node)) {
+      matched = false;
       break;
     }
 
+    // Step 1: Bind Operation of current op to match_ctx.
     source_pattern_match_ctx->BindIrOperation(drr_node, ir_node);
 
-    // binding input_tensor of current_op
+    // Step 2: Bind input_tensor of current op to match_ctx.
+    const auto& drr_input_tensors = drr_node->inputs();
+    auto ir_input_values = ir_node->operands_source();
     for (size_t i = 0; i < drr_input_tensors.size(); ++i) {
-      if (source_pattern_match_ctx->tensor_map().count(
-              drr_input_tensors[i]->name()) != 0 &&
-          ir_node->operand(i).source() !=
-              source_pattern_match_ctx->tensor_map().at(
-                  drr_input_tensors[i]->name())) {
+      if (HasVisitedOperands(drr_input_tensors[i], ir_input_values[i])) {
         matched = false;
         VLOG(8) << " tensor_map key[" << drr_input_tensors[i]->name()
                 << "] already exists,but value is different!";
         break;
-      } else {
-        source_pattern_match_ctx->BindIrValue(drr_input_tensors[i]->name(),
-                                              ir_node->operand(i).source());
-      }
-
-      if (ir_node->operand_source(i).isa<pir::BlockArgument>()) {
-        VLOG(8) << "Match Attention! Found BlockArgument as input of "
-                << drr_node->name();
       }
-
+      source_pattern_match_ctx->BindIrValue(drr_input_tensors[i]->name(),
+                                            ir_input_values[i]);
+      // Skip it when drr_producer_op is nullptr, i.e. we reached the
+      // pattern boundary.
       auto* drr_producer_op = drr_input_tensors[i]->producer();
       if (drr_producer_op == nullptr) {
         continue;
       }
-
+      // Check whether tensor and value have the same use_count.
       if (drr_input_tensors[i]->consumers().size() !=
-          ir_node->operand(i).source().use_count()) {
+          ir_input_values[i].use_count()) {
         matched = false;
         VLOG(8) << drr_node->name() << " Match failed: consumers of drr intput["
                 << i << "] { " << drr_node->outputs().size()
                 << " } != consumers of pir intput[" << i << "] { "
-                << ir_node->operand(i).source().use_count() << " }.";
+                << ir_input_values[i].use_count() << " }.";
         break;
       }
-      auto* ir_producer_op = ir_node->operand_source(i).defining_op();
-      // bfs producer_op of current_op
-      if (drr_visited.count(drr_producer_op) &&
-          ir_visited.count(ir_producer_op)) {
-        continue;
+      auto* ir_producer_op = ir_input_values[i].defining_op();
+      // Trigger early stop when the operand is a BlockArgument, i.e.
+      // ir_producer_op == nullptr.
+      if (drr_producer_op && ir_producer_op == nullptr) {
+        matched = false;
+        break;
       }
-
-      if (!drr_visited.count(drr_producer_op) &&
-          !ir_visited.count(ir_producer_op)) {
-        drr_q.push(drr_producer_op);
-        ir_q.push(ir_producer_op);
-        drr_visited.insert(drr_producer_op);
-        ir_visited.insert(ir_producer_op);
-      } else {
+      // bfs producer_op of current_op
+      if (!TryUpdateDrrQueue(drr_producer_op, ir_producer_op)) {
         matched = false;
         VLOG(8) << "Match failed: status of visiting for" << drr_node->name()
                 << " is different.";

From 98f48ba2947739636c18e986f5fadfa8f5041cf5 Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Sat, 2 Mar 2024 10:16:32 +0800
Subject: [PATCH 10/15] [SOT] fix bug in llm stable diffusion (#62257)

---
 .../executor/opcode_executor.py               | 19 ++++-
 .../executor/variables/__init__.py            |  2 +-
 .../executor/variables/callable.py            |  6 +-
 .../instruction_utils/opcode_analysis.py      | 74 ++++++++++++-------
 .../paddle/jit/sot/utils/paddle_api_config.py |  1 -
 test/sot/test_break_graph.py                  | 15 ++++
 6 files changed, 82 insertions(+), 35 deletions(-)

diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
index 3dfa9fb1b733b3..7f28346922d918 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
@@ -88,6 +88,7 @@
     TensorVariable,
     TupleVariable,
     UserDefinedFunctionVariable,
+    UserDefinedGeneratorFunctionVariable,
     VariableBase,
     VariableFactory,
 )
@@ -1318,11 +1319,21 @@ def g(z=x):
             default_args,
             closure,
         )
-        self.stack.push(
-            UserDefinedFunctionVariable(
-                new_fn, self._graph, DummyTracker(related_list)
+        # new_fn is created with Variables bound into it, so
+        # new_fn.__module__ is a ConstantVariable and we
+        # can not use VariableFactory.from_value
+        if inspect.isgeneratorfunction(new_fn):
+            self.stack.push(
+                UserDefinedGeneratorFunctionVariable(
+                    new_fn, self._graph, DummyTracker(related_list)
+                )
+            )
+        else:
+            self.stack.push(
+                UserDefinedFunctionVariable(
+                    new_fn, self._graph, DummyTracker(related_list)
+                )
             )
-        )
 
     def GET_ITER(self, instr: Instruction):
         source_obj = self.stack.pop()
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py b/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py
index 
989c23e110abd0..3d53d1fce93dc3 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py
@@ -44,7 +44,7 @@
     PaddleApiVariable,
     PaddleLayerVariable,
     UserDefinedFunctionVariable,
-    UserDefinedGeneratorVariable,
+    UserDefinedGeneratorFunctionVariable,
     UserDefinedLayerVariable,
 )
 from .container import (  # noqa: F401
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py
index 0e6ba7ec1e33fd..1648ebcf79b4d8 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py
@@ -681,9 +681,9 @@ def main_info(self) -> dict[str, Any]:
     }
 
 
-class UserDefinedGeneratorVariable(FunctionVariable):
+class UserDefinedGeneratorFunctionVariable(FunctionVariable):
     """
-    UserDefinedGeneratorVariable is a subclass of FunctionVariable used to wrap a user-defined generator.
+    UserDefinedGeneratorFunctionVariable is a subclass of FunctionVariable used to wrap a user-defined generator.
     Args:
         fn (Callable[..., Any]): The user-defined generator to be wrapped.
         graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
@@ -711,7 +711,7 @@ def main_info(self) -> dict[str, Any]:
     )
     def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
         if inspect.isgeneratorfunction(value):
-            return UserDefinedGeneratorVariable(value, graph, tracker)
+            return UserDefinedGeneratorFunctionVariable(value, graph, tracker)
         return None
 
 
diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
index 93722f42c9602a..3d7c1cb7d1f46c 100644
--- a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
+++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
@@ -23,21 +23,19 @@
     ALL_JUMP,
     HAS_FREE,
     HAS_LOCAL,
-    RETURN,
     UNCONDITIONAL_JUMP,
 )
 
 
 @dataclasses.dataclass
-class State:
+class NameRecorder:
     reads: OrderedSet[str]
     writes: OrderedSet[str]
-    visited: OrderedSet[int]
 
     def __or__(self, other):
         reads = self.reads | other.reads
         writes = self.writes | other.writes
-        return State(reads, writes, OrderedSet())
+        return NameRecorder(reads, writes)
 
 
 def is_read_opcode(opname):
@@ -90,46 +88,70 @@ def analysis_used_names(
     Returns:
         State: The analysis result.
     """
-    root_state = State(OrderedSet(), OrderedSet(), OrderedSet())
-
-    def fork(state: State, start: int, jump: bool, jump_target: int) -> State:
+    name_recorder = NameRecorder(OrderedSet(), OrderedSet())
+
+    # The start idx and the writes set decide the analysis result below,
+    # so just check the pair of (idx, writes) to skip repeated simulation
+    # (writes decide whether a name should be added to reads).
+    # One idx can keep multiple writes sets when none is a subset of another;
+    # if A is a subset of B, we just record A, since simulating A might add
+    # more reads.
+    visited_states = {}
+
+    def check_and_update_visited_states(idx, writes):
+        writes = set(writes)
+
+        if idx in visited_states:
+            history = visited_states[idx]
+            for record in history:
+                if record.issubset(writes):
+                    return True
+                elif writes.issubset(record):
+                    history.remove(record)
+            history.append(writes)
+            return False
+        else:
+            visited_states[idx] = [writes]
+
+        return False
+
+    def fork(
+        name_recorder: NameRecorder, start: int, jump: bool, jump_target: int
+    ) -> NameRecorder:
         new_start = start + 1 if not jump else jump_target
-        new_state = State(
-            OrderedSet(state.reads),
-            OrderedSet(state.writes),
-            OrderedSet(state.visited),
+        new_state = NameRecorder(
+            OrderedSet(name_recorder.reads),
+            OrderedSet(name_recorder.writes),
         )
         return walk(new_state, new_start)
 
-    def walk(state: State, start: int) -> State:
+    def walk(name_recorder: NameRecorder, start: int) -> NameRecorder:
         end = len(instructions) if stop_instr_idx is None else stop_instr_idx
         for i in range(start, end):
-            if i in state.visited:
-                return state
-            state.visited.add(i)
+            if check_and_update_visited_states(i, name_recorder.writes):
+                return name_recorder
             instr = instructions[i]
             if instr.opname in HAS_LOCAL | HAS_FREE:
                 if is_read_opcode(instr.opname) and instr.argval not in (
-                    state.writes
+                    name_recorder.writes
                 ):
-                    state.reads.add(instr.argval)
+                    name_recorder.reads.add(instr.argval)
                 elif is_write_opcode(instr.opname):
-                    state.writes.add(instr.argval)
+                    name_recorder.writes.add(instr.argval)
             elif instr.opname in ALL_JUMP:
                 assert instr.jump_to is not None
                 target_idx = instructions.index(instr.jump_to)
                 # Fork to two branches, jump or not
-                jump_branch = fork(state, i, True, target_idx)
+                jump_branch = fork(name_recorder, i, True, target_idx)
                 not_jump_branch = (
-                    fork(state, i, False, target_idx)
+                    fork(name_recorder, i, False, target_idx)
                     if instr.opname not in UNCONDITIONAL_JUMP
-                    else State(OrderedSet(), OrderedSet(), OrderedSet())
+                    else NameRecorder(OrderedSet(), OrderedSet())
                 )
                 return jump_branch | not_jump_branch
-            elif instr.opname in RETURN:
-                return state
-        return state
+            elif instr.opname == "RETURN_VALUE":
+                return name_recorder
+        return name_recorder
 
-    state = walk(root_state, current_instr_idx)
-    return state.reads, state.writes
+    name_recorder = walk(name_recorder, current_instr_idx)
+    return name_recorder.reads, name_recorder.writes
diff --git a/python/paddle/jit/sot/utils/paddle_api_config.py b/python/paddle/jit/sot/utils/paddle_api_config.py
index 8a5cde9e657160..24b58bda9b83b4 100644
--- a/python/paddle/jit/sot/utils/paddle_api_config.py
+++ b/python/paddle/jit/sot/utils/paddle_api_config.py
@@ -82,7 +82,6 @@ def get_paddle_api():
 # considered as paddle module?
paddle_api_module_prefix = { "paddle.nn.functional", - "paddle.nn.layer.activation", } break_graph_set = set() diff --git a/test/sot/test_break_graph.py b/test/sot/test_break_graph.py index b6908f4d229b57..58cab6d48b0a3e 100644 --- a/test/sot/test_break_graph.py +++ b/test/sot/test_break_graph.py @@ -185,5 +185,20 @@ def test_break_graph_in_layer(self): self.assert_results(net.forward, x) +def dummy(*args): + return None + + +def break_graph_call_generator_function(x): + return dummy(y for y in x) + + +class TestBreakGraphCallGeneratorFunction(TestCaseBase): + def test_break_graph_when_call_generator_function(self): + x = paddle.rand([1], dtype=paddle.float32) + y = paddle.rand([1], dtype=paddle.float32) + self.assert_results(break_graph_call_generator_function, [x, y]) + + if __name__ == "__main__": unittest.main() From eabf863247fef18d5d7912817c9a1a95d3ddf23f Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Sat, 2 Mar 2024 11:02:44 +0800 Subject: [PATCH 11/15] [Dy2St][PIR] Add view op to inplace info (#62300) --- paddle/fluid/pybind/pir.cc | 5 ++ test/dygraph_to_static/test_deal_inplace.py | 53 +++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 test/dygraph_to_static/test_deal_inplace.py diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 45fe7263e692c5..d28b2743482016 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1360,7 +1360,12 @@ std::map GetOpInplaceInfo(const pir::Operation *op) { const std::string &inplace_name = yaml_parser.InplaceName(value_name); inplace_info[i] = yaml_parser.InputName2Id().at(inplace_name); } + if (yaml_parser.HasView(value_name)) { + const std::string &view_name = yaml_parser.ViewName(value_name); + inplace_info[i] = yaml_parser.InputName2Id().at(view_name); + } } + return inplace_info; } diff --git a/test/dygraph_to_static/test_deal_inplace.py b/test/dygraph_to_static/test_deal_inplace.py new file mode 100644 index 00000000000000..3984dd729db0a2 --- /dev/null +++ b/test/dygraph_to_static/test_deal_inplace.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
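+
+# These tests check that GetOpInplaceInfo also records view ops (the
+# batch_norm case) in addition to regular inplace ops (the sigmoid case).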
+ +import unittest + +import numpy as np +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_pir_only, +) + +import paddle + + +def fn_with_inplace_op(inplace_op, x): + y = inplace_op(x) + z = inplace_op(x) + return y + z + + +class TestDealInplace(Dy2StTestBase): + def run_test(self, dygraph_fn, *inputs): + dygraph_out = dygraph_fn(*inputs) + static_fn = paddle.jit.to_static(dygraph_fn) + static_out = static_fn(*inputs) + np.testing.assert_allclose(dygraph_out.numpy(), static_out.numpy()) + + @test_pir_only + def test_deal_view(self): + bn_layer = paddle.nn.BatchNorm2D(10) + x = paddle.to_tensor(np.random.random((2, 10, 3, 3)).astype('float32')) + self.run_test(fn_with_inplace_op, bn_layer, x) + + @test_pir_only + def test_deal_inplace(self): + sigmoid_layer = paddle.nn.Sigmoid() + x = paddle.to_tensor(np.random.random((2, 10, 3, 3)).astype('float32')) + self.run_test(fn_with_inplace_op, sigmoid_layer, x) + + +if __name__ == '__main__': + unittest.main() From 6f608ca9d2c84db75e7bff4ce7a9be9a321a1fba Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Sat, 2 Mar 2024 12:31:30 +0800 Subject: [PATCH 12/15] [PT] Set NCHW as default Layout for type translator (#62263) * [PT] Set NCHW as default Layout for type translator * fix randint * fix typo * fix delt --- .../ir_adaptor/translator/op_translator.cc | 2 +- .../ir_adaptor/translator/type_translator.cc | 89 +++++++++---------- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index bf5acda9c1bbd3..3466c074ed9948 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -2746,7 +2746,7 @@ struct RandIntOpTranscriber : public OpTranscriber { paddle::dialect::DenseTensorTypeStorage::Dim dim = common::make_ddim(var->GetShape()); paddle::dialect::DenseTensorTypeStorage::DataLayout layout = - paddle::dialect::DenseTensorTypeStorage::DataLayout::UNDEFINED; + paddle::dialect::DenseTensorTypeStorage::DataLayout::NCHW; paddle::dialect::DenseTensorTypeStorage::LoD lod = {}; size_t offset = 0; pir::Type translated_var_type = paddle::dialect::DenseTensorType::get( diff --git a/paddle/fluid/ir_adaptor/translator/type_translator.cc b/paddle/fluid/ir_adaptor/translator/type_translator.cc index 7cd297cf46b62d..4378ef5285ceb0 100644 --- a/paddle/fluid/ir_adaptor/translator/type_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/type_translator.cc @@ -30,8 +30,48 @@ using DenseTensorType = paddle::dialect::DenseTensorType; using DenseTensorTypeStorage = paddle::dialect::DenseTensorTypeStorage; using SelectedRowsType = paddle::dialect::SelectedRowsType; using SelectedRowsTypeStorage = paddle::dialect::SelectedRowsTypeStorage; +using DataLayout = DenseTensorTypeStorage::DataLayout; +using LoD = DenseTensorTypeStorage::LoD; TypeTranslator::TypeTranslator() { + const auto& HandleTensor = [&](pir::IrContext* ctx, + const VarDesc& var_desc) -> pir::Type { + VLOG(10) << "[vartype translating]" + << "[" << var_desc.Name() << "] from LOD_TENSOR"; + const pir::Type dtype = + this->operator[](var_desc.GetDataType())(ctx, var_desc); + const auto dim = common::make_ddim(var_desc.GetShape()); + const auto layout = DataLayout::NCHW; + const LoD lod = {}; + const size_t offset = 0; + return DenseTensorType::get(ctx, dtype, dim, layout, lod, offset); + }; + const auto& HandleTensorArray = [&](pir::IrContext* ctx, + const VarDesc& var_desc) -> pir::Type { + VLOG(10) << "[vartype 
translating]"
+             << "[" << var_desc.Name() << "] from LOD_TENSOR_ARRAY";
+    const pir::Type dtype =
+        this->operator[](var_desc.GetDataType())(ctx, var_desc);
+    const auto dims = common::make_ddim(var_desc.GetShape());
+    const auto layout = DataLayout::NCHW;
+    return paddle::dialect::DenseTensorArrayType::get(ctx, dtype, dims, layout);
+  };
+
+  const auto& HandleSelectedRows = [&](pir::IrContext* ctx,
+                                       const VarDesc& var_desc) -> pir::Type {
+    VLOG(10) << "[vartype translating]"
+             << "[" << var_desc.Name() << "] from SELECTED_ROWS";
+    const pir::Type dtype =
+        this->operator[](var_desc.GetDataType())(ctx, var_desc);
+    const auto dim = common::make_ddim(var_desc.GetShape());
+    const auto layout = DataLayout::NCHW;
+    const LoD lod = {};
+    const size_t offset = 0;
+    pir::Type SelectedRows =
+        SelectedRowsType::get(ctx, dtype, dim, layout, lod, offset);
+    return SelectedRows;
+  };
+
   handlers = {
       {VarType::BOOL,
        [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
@@ -81,52 +121,9 @@ TypeTranslator::TypeTranslator() {
        [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
          return pir::Complex128Type::get(ctx);
        }},
-      {VarType::LOD_TENSOR,
-       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
-         VLOG(10) << "[vartype translating]"
-                  << "[" << var_desc.Name() << "] from LOD_TENSOR";
-
-         pir::Type dtype =
-             this->operator[](var_desc.GetDataType())(ctx, var_desc);
-         DenseTensorTypeStorage::Dim dim =
-             common::make_ddim(var_desc.GetShape());
-         DenseTensorTypeStorage::DataLayout layout =
-             DenseTensorTypeStorage::DataLayout::UNDEFINED;
-         DenseTensorTypeStorage::LoD lod = {};
-         size_t offset = 0;
-         return DenseTensorType::get(ctx, dtype, dim, layout, lod, offset);
-       }},
-      {VarType::LOD_TENSOR_ARRAY,
-       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
-         VLOG(10) << "[vartype translating]"
-                  << "[" << var_desc.Name() << "] from LOD_TENSOR_ARRAY";
-         pir::Type dtype =
-             this->operator[](var_desc.GetDataType())(ctx, var_desc);
-         phi::DDim dims = common::make_ddim(var_desc.GetShape());
-         DenseTensorTypeStorage::DataLayout layout =
-             DenseTensorTypeStorage::DataLayout::UNDEFINED;
-
-         return paddle::dialect::DenseTensorArrayType::get(
-             ctx, dtype, dims, layout);
-       }},
-      {VarType::SELECTED_ROWS,
-       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
-         VLOG(10) << "[vartype translating]"
-                  << "[" << var_desc.Name() << "] from SELECTED_ROWS";
-
-         pir::Type dtype =
-             this->operator[](var_desc.GetDataType())(ctx, var_desc);
-
-         SelectedRowsTypeStorage::Dim dim =
-             common::make_ddim(var_desc.GetShape());
-         SelectedRowsTypeStorage::DataLayout layout =
-             SelectedRowsTypeStorage::DataLayout::UNDEFINED;
-         SelectedRowsTypeStorage::LoD lod = {};
-         size_t offset = 0;
-         pir::Type SelectedRows =
-             SelectedRowsType::get(ctx, dtype, dim, layout, lod, offset);
-         return SelectedRows;
-       }},
+      {VarType::LOD_TENSOR, HandleTensor},
+      {VarType::LOD_TENSOR_ARRAY, HandleTensorArray},
+      {VarType::SELECTED_ROWS, HandleSelectedRows},
   };
 }
 

From 94018aecdeddb4169232655631f5b1cc762f8c8f Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Sat, 2 Mar 2024 12:38:16 +0800
Subject: [PATCH 13/15] [CINN]Fix group op attribute hash bug (#62309)

* fix group op attribute hash bug

* fix bug

---
 paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h   | 5 +++++
 .../dialect/operator/transforms/cinn_group_cluster_pass.cc | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h 
b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h index 61a2ae3268e05d..d338dcd84b04d1 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h +++ b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h @@ -71,6 +71,11 @@ struct GroupInfoAttributeStorage : public pir::AttributeStorage { static std::size_t HashValue(const ParamKey& key) { size_t hash_value = std::hash{}(key.group_id); + for (auto op : key.ops) { + hash_value = + pir::detail::hash_combine(hash_value, std::hash()(op)); + } + for (auto d : key.loop_ranges) { hash_value = pir::detail::hash_combine(hash_value, std::hash()(d)); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc index 9f9856004646fc..f0069a55a4cdee 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc @@ -252,7 +252,7 @@ cinn::dialect::GroupInfo BuildGroupInfo( const GroupClusterNode& node, const std::unordered_map<::pir::Operation*, std::vector>& new_align_info) { - cinn::dialect::GroupInfo group_info({}); + cinn::dialect::GroupInfo group_info(vec_new_op_list); group_info.group_id = BuildGroupId(vec_new_op_list); group_info.loop_ranges = node.loop_ranges; group_info.reduce_axis = node.reduce_axis; From 8b4219b0b84b42df40ebb439440ce5445d769884 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Sat, 2 Mar 2024 15:10:35 +0800 Subject: [PATCH 14/15] add argmax & argmin (#62312) --- .../infer_symbolic_shape/infer_sym_utils.h | 3 + .../infer_symbolic_shape.h | 1 + .../paddle_op_infer_sym.cc | 13 -- .../paddle_op_infer_sym.h | 5 - .../infer_symbolic_shape/unary_infer_sym.cc | 77 ++++++++++++ .../infer_symbolic_shape/unary_infer_sym.h | 26 ++++ .../pir/transforms/shape_optimization_pass.cc | 4 +- .../symbolic/test_unary_op_infer_sym_shape.py | 112 ++++++++++++++++++ 8 files changed, 220 insertions(+), 21 deletions(-) create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h create mode 100644 test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index d2d508ff5890db..f5193b3f7ff5b5 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -17,6 +17,9 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" +#define GET_BOOL_ATTR(op, str) \ + op->attributes().at(str).dyn_cast().data(); + // To make codes shorter using ExprVec = std::vector; using ShapeOrData = symbol::ShapeOrDataDimExprs; diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h index 4e1946acd75f1a..515eaaca1b3484 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h @@ -18,6 +18,7 @@ #include 
"paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" // Type inference is currently modelled executionally for operation creation diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index 1be26c82f4c21a..d7ee4fb6781b0f 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -1174,19 +1174,6 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, // Not Impelmented Ops. -bool ArgmaxOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool ArgminOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - bool AsComplexOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h index cf5e650023fa95..f23e84c27f55d5 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h @@ -114,11 +114,6 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, // Not Impelmented Ops. -bool ArgmaxOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); -bool ArgminOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); - bool AsComplexOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool AsRealOpInferSymbolicShape(pir::Operation *op, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc new file mode 100644 index 00000000000000..d82fc12521998a --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" +// #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" + +namespace paddle::dialect { + +bool ArgmaxOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + bool flatten = GET_BOOL_ATTR(op, "flatten"); + bool keepdims = GET_BOOL_ATTR(op, "keepdims"); + + const auto &input_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + + const auto &axis_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(1)); + int axis = + static_cast(axis_shape_or_data.data().value()[0].Get()); + + const std::vector &input_sym_shape = + input_shape_or_data.data().has_value() + ? input_shape_or_data.data().value() + : input_shape_or_data.shape(); + + int rank = input_sym_shape.size(); + if (axis < 0) axis += rank; + + const auto &out_sym_shape = [&] { + std::vector out_sym_shape; + if (flatten) { + if (keepdims) { + out_sym_shape.emplace_back(std::int64_t(rank)); + } else { + out_sym_shape.emplace_back(std::int64_t(0)); + } + } else { + for (int i = 0; i < axis; i++) { + out_sym_shape.emplace_back(input_sym_shape[i]); + } + if (keepdims) { + out_sym_shape.emplace_back(std::int64_t(1)); + } + + for (int i = axis + 1; i < rank; i++) { + out_sym_shape.emplace_back(input_sym_shape[i]); + } + } + return out_sym_shape; + }(); + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(out_sym_shape)}; + + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; +} + +bool ArgminOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return ArgmaxOpInferSymbolicShape(op, shape_analysis); +} + +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h new file mode 100644 index 00000000000000..832a6a7a074c36 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" + +namespace paddle::dialect { + +bool ArgmaxOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); +bool ArgminOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); + +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index d9cf96f78efe99..85f4a5a5eef498 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -23,7 +23,7 @@ COMMON_DECLARE_bool(pir_apply_shape_optimization_pass); -const int vlog_level = 3; +constexpr int vlog_level = 3; namespace pir { namespace { @@ -144,8 +144,6 @@ void InferSymExprForBlock(const Block& block, &op, shape_analysis->GetShapeOrDataForValue(op.result(0))); } } else { - VLOG(vlog_level) << op.name() + - " DOES NOT have InferSymbolicShapeInterface!"; PADDLE_THROW(phi::errors::Unimplemented( op.name() + " DOES NOT have InferSymbolicShapeInterface!")); } diff --git a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py new file mode 100644 index 00000000000000..5260475b45f1e8 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
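+
+# Builds a net that calls paddle.argmax / paddle.argmin and checks the
+# symbolic shapes inferred for pd_op.argmax and pd_op.argmin.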
+ +import unittest + +import numpy as np + +import paddle +from paddle.static import InputSpec + + +def get_sym_shape_str_for_op(net, input_spec, op_name='builtin.shadow_output'): + forward_program = net.forward.get_concrete_program(*input_spec)[ + 1 + ].infer_program.forward_program + all_sym_shape_str = [] + for op in forward_program.global_block().ops: + if op.name() == op_name: + all_sym_shape_str.append(op.attrs()['sym_shape_str']) + + return all_sym_shape_str + + +def apply_to_static(net, use_cinn, input_spec=None): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static( + net, + input_spec=input_spec, + build_strategy=build_strategy, + full_graph=True, + ) + + +class TestBase(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.prepare_data() + + def prepare_data(self): + pass + + def test_eval_symbolic(self): + pass + + +class ArgMaxMinNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + argmax_out = paddle.argmax(x) + argmin_out = paddle.argmin(x, axis=-1) + return argmax_out, argmin_out + + +class TestArgMaxMinOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[0], data[NULL]', + 'shape[S0, S1], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = ArgMaxMinNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.argmax' + ) + sym_shape_str_list += get_sym_shape_str_for_op( + net, input_spec, 'pd_op.argmin' + ) + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +if __name__ == '__main__': + unittest.main() From 6fccb8f20c283abcbf28d0ed7e82be9c83e7ce45 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Sat, 2 Mar 2024 17:09:09 +0800 Subject: [PATCH 15/15] [CINN] uniform all the 0 and reduce deleted axis (#61608) * uniform all the 0 and reduce deleted axis * remove one shape for keepdim cases. 
* fix by code review * fix some error in 0d format --- paddle/cinn/ast_gen_ius/ast_gen.cc | 86 +++++++++++++++++++++++++----- paddle/cinn/hlir/pe/reduction.cc | 8 +++ paddle/cinn/ir/ir.cc | 5 +- paddle/cinn/ir/ir.h | 15 ++++-- paddle/cinn/lang/compute.cc | 7 +++ paddle/cinn/pybind/ir/ir_api.cc | 1 + paddle/cinn/runtime/flags.cc | 4 ++ 7 files changed, 107 insertions(+), 19 deletions(-) diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc index 009158d3f9cce2..57b10fb7ca8849 100644 --- a/paddle/cinn/ast_gen_ius/ast_gen.cc +++ b/paddle/cinn/ast_gen_ius/ast_gen.cc @@ -22,6 +22,7 @@ #include "paddle/cinn/optim/replace_var_with_expr.h" PD_DECLARE_bool(cinn_new_group_scheduler); +PD_DECLARE_bool(group_schedule_tiling_first); PD_DECLARE_bool(cinn_bucket_compile); namespace cinn { @@ -93,9 +94,21 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { std::vector iter_values; // reduce body and reduce init schedule block should have different objects // for same axis so we re-create objects + VLOG(4) << "FLAGS_group_schedule_tiling_first = " + << FLAGS_group_schedule_tiling_first; std::vector axis_vars = cinn::common::GenDefaultAxis(axis_len); + const std::vector& reduce_axis = tensor->reduce_axis; + VLOG(4) << "ast gen: tensor init_body is " << init_body; for (int i = 0; i < shape.size(); ++i) { - if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { + bool is_keep_dim = axis[i]->is_keepdim; + if (FLAGS_group_schedule_tiling_first && is_keep_dim) { + // if tiling first, we need to replace the reduce axis with 0, but don't + // deal with the non-reduce axis + optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0)); + continue; + } + if (!FLAGS_group_schedule_tiling_first && + FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0)); continue; } @@ -105,21 +118,25 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { /*is_reduce = */ false)); optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars.back()); axis_vars[i]->is_reduce_axis = false; - if (shape[i] == Expr(1)) { + if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) { iter_values.push_back(Expr(0)); } else { iter_values.push_back(axis_vars[i]); } } + VLOG(4) << "iter_value.size() and block_vars.size() is " + << iter_values.size() << " " << block_vars.size(); init_body = ir::ScheduleBlockRealize::Make( iter_values, ir::ScheduleBlock::Make( block_vars, {}, {}, reduce_init_name, init_body)); // For the remaining reduce axis, make reduce body - const std::vector& reduce_axis = tensor->reduce_axis; ir::Expr reduce_body = ConvertReduceBody(tensor->body(), tensor, axis_exprs); + + VLOG(4) << "ast gen: reduce body is " << reduce_body; + // create schedule block itervars, i0,i1... 
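+  // (vars for the kept data axes come first; vars for the reduce axes are
+  // appended right after them below)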
std::vector reduce_block_vars; std::vector reduce_iter_values; @@ -127,7 +144,15 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { // for same axis so we re-create objects std::vector reduce_axis_vars = cinn::common::GenDefaultAxis(axis_len); for (int i = 0; i < shape.size(); ++i) { - if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { + bool is_keep_dim = axis[i]->is_keepdim; + if (FLAGS_group_schedule_tiling_first && is_keep_dim) { + // if tiling first, we need to replace the reduce axis with 0, but don't + // deal with the non-reduce axis + optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0)); + continue; + } + if (!FLAGS_group_schedule_tiling_first && + FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0)); continue; } @@ -136,12 +161,13 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { cinn::UniqName("i" + std::to_string(i)), /*is_reduce = */ false)); reduce_axis_vars[i]->is_reduce_axis = false; - if (shape[i] == Expr(1)) { + if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) { reduce_iter_values.push_back(Expr(0)); } else { reduce_iter_values.push_back(axis_vars[i]); } } + VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body; for (int i = 0; i < reduce_axis.size(); ++i) { int count = shape.size() + i; reduce_block_vars.push_back( @@ -155,14 +181,43 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { } int non_zero_axis_size = 0; - for (int i = 0; i < axis.size(); ++i) { - if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { - continue; + if (FLAGS_group_schedule_tiling_first) { + std::vector non_reduce_axis_vars = [&]() { + std::vector res; + for (int i = 0; i < shape.size(); ++i) { + bool is_keep_dim = axis[i]->is_keepdim; + if (!is_keep_dim) { + res.push_back(axis[i]); + } + } + return res; + }(); + for (int i = 0; i < non_reduce_axis_vars.size(); ++i) { + optim::ReplaceVarWithExpr( + &reduce_body, non_reduce_axis_vars[i], reduce_block_vars[i]); + ++non_zero_axis_size; } - optim::ReplaceVarWithExpr( - &reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]); - ++non_zero_axis_size; + } else { + for (int i = 0; i < axis.size(); ++i) { + if (!FLAGS_group_schedule_tiling_first && + FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { + continue; + } + optim::ReplaceVarWithExpr( + &reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]); + ++non_zero_axis_size; + } + } + + VLOG(4) << "to replace : " << non_zero_axis_size << " " + << reduce_block_vars.size(); + for (auto i = 0; i < reduce_block_vars.size(); i++) { + VLOG(4) << "reduce_block_vars[" << i << "] = " << reduce_block_vars[i]; + } + for (auto i = 0; i < reduce_axis.size(); i++) { + VLOG(4) << "reduce_axis[" << i << "] = " << reduce_axis[i]; } + VLOG(4) << "before replace body: " << reduce_body; for (int i = non_zero_axis_size; i < reduce_block_vars.size(); ++i) { optim::ReplaceVarWithExpr(&reduce_body, reduce_axis[i - non_zero_axis_size], @@ -185,7 +240,12 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { // Put the two parts together ir::Expr body = ir::Block::Make({init_body, reduce_body}); for (int i = static_cast(axis_len) - 1; i >= 0; --i) { - if (!FLAGS_cinn_bucket_compile && shape[i] == Expr(1)) { + bool is_keep_dim = axis[i]->is_keepdim; + if (FLAGS_group_schedule_tiling_first && is_keep_dim) { + continue; + } + if (!FLAGS_group_schedule_tiling_first && 
!FLAGS_cinn_bucket_compile && + shape[i] == Expr(1)) { continue; } ir::Var loop_var = axis[i]; @@ -210,7 +270,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false)); optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]); axis_vars[i]->is_reduce_axis = false; - if (shape[i] == Expr(1)) { + if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) { iter_values.push_back(Expr(0)); } else { iter_values.push_back(axis_vars[i]); diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc index 7e33a1475e48b3..605a1b3d6443fe 100644 --- a/paddle/cinn/hlir/pe/reduction.cc +++ b/paddle/cinn/hlir/pe/reduction.cc @@ -166,6 +166,14 @@ Tensor DoReduce(const Tensor& tensor, int indice_cnt = 0; int reduce_cnt = 0; + // Set keepdim flags of indices. + if (tensor->shape.size() == indices.size()) { + for (const auto& i : real_axes) { + VLOG(4) << "Set is_keepdim = true for var(" << i << ")"; + indices[i].as_var_ref()->is_keepdim = true; + } + } + for (size_t i = 0; i < tensor->shape.size(); ++i) { bool squeeze_i = std::find(squeeze_axes.begin(), squeeze_axes.end(), i) != squeeze_axes.end(); diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 2e194200d19937..f3c64790551cac 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -218,11 +218,13 @@ Expr _Var_::Make(Expr lower_bound, Expr upper_bound, const std::string &name, bool is_reduce_axis, - bool is_symbolic_constant) { + bool is_symbolic_constant, + bool is_keepdim) { auto *n = make_shared<_Var_>(); n->lower_bound = lower_bound; n->upper_bound = upper_bound; n->is_reduce_axis = is_reduce_axis; + n->is_keepdim = is_keepdim; n->is_symbolic_constant = is_symbolic_constant; n->name = name; n->set_type(lower_bound.type()); @@ -233,6 +235,7 @@ Expr _Var_::Copy() const { auto *n = make_shared<_Var_>(); n->name = name; n->is_reduce_axis = is_reduce_axis; + n->is_keepdim = is_keepdim; n->lower_bound = lower_bound; n->upper_bound = upper_bound; n->set_type(type()); diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index c02517f9836fc3..5a1f9f6a1f739f 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -381,6 +381,7 @@ struct _Var_ : public ExprNode<_Var_> { std::string name; bool is_reduce_axis{false}; + bool is_keepdim{false}; bool is_symbolic_constant{false}; //! Lower bound and upper bound of a axis. 
// @{ @@ -401,7 +402,8 @@ struct _Var_ : public ExprNode<_Var_> { Expr upper_bound, const std::string& name, bool is_reduce, - bool is_symbolic_constant = false); + bool is_symbolic_constant = false, + bool is_keepdim = false); void Verify() const override; @@ -419,12 +421,14 @@ struct Var : public IrNodeRef { Var(Expr lower_bound, Expr upper_bound, const std::string& name, - bool is_reduce = false) - : Var(_Var_::Make(lower_bound, upper_bound, name, is_reduce)) {} + bool is_reduce = false, + bool is_keepdim = false) + : Var(_Var_::Make( + lower_bound, upper_bound, name, is_reduce, false, is_keepdim)) {} Var(int upper_bound, const std::string& name) - : Var(_Var_::Make(Expr(0), Expr(upper_bound), name, false)) {} + : Var(_Var_::Make(Expr(0), Expr(upper_bound), name, false, false)) {} Var(Expr upper_bound, const std::string& name) - : Var(_Var_::Make(Expr(0), upper_bound, name, false)) {} + : Var(_Var_::Make(Expr(0), upper_bound, name, false, false)) {} operator Expr() { return Expr(get()); } operator Expr() const { @@ -977,6 +981,7 @@ struct ScheduleBlock : public ExprNode { std::map attrs; std::string name; Expr body; + int32_t reduce_type{-1}; // 0 for warp reduce, 1 for block reduce static Expr Make(const std::vector& iter_vars, const std::vector& read_buffers, diff --git a/paddle/cinn/lang/compute.cc b/paddle/cinn/lang/compute.cc index 4828eaac64e13c..bd195fd26a6390 100644 --- a/paddle/cinn/lang/compute.cc +++ b/paddle/cinn/lang/compute.cc @@ -187,6 +187,13 @@ ir::Tensor Compute(const std::vector &domain, domain_without_reduce_axis, op, reduce_axis); + const auto set_keep_dim_for_tensor = [&]() { + for (int i = 0; i < _axis.size(); ++i) { + const auto &axis_var = _axis.at(i); + tensor->axis_[i]->is_keepdim = axis_var.as_var_ref()->is_keepdim; + } + }; + set_keep_dim_for_tensor(); return tensor; } diff --git a/paddle/cinn/pybind/ir/ir_api.cc b/paddle/cinn/pybind/ir/ir_api.cc index 56dff498dd7101..efebf1206a8674 100644 --- a/paddle/cinn/pybind/ir/ir_api.cc +++ b/paddle/cinn/pybind/ir/ir_api.cc @@ -383,6 +383,7 @@ void BindIrIr(py::module *m) { ir::Expr, const std::string &, bool, + bool, bool>(&ir::_Var_::Make)) .def("copy", &ir::_Var_::Copy); diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc index 89512913e8fa98..c9f0760d43e80b 100644 --- a/paddle/cinn/runtime/flags.cc +++ b/paddle/cinn/runtime/flags.cc @@ -69,6 +69,10 @@ PD_DEFINE_bool(cinn_bucket_compile, BoolFromEnv("FLAGS_cinn_bucket_compile", false), "Whether to enable bucket compile for dynamic shape."); +PD_DEFINE_bool(group_schedule_tiling_first, + BoolFromEnv("FLAGS_group_schedule_tiling_first", false), + "Whether to enable new group scheduler tiling first strategy."); + PD_DEFINE_bool(cinn_use_common_subexpression_elimination, BoolFromEnv("FLAGS_cinn_use_common_subexpression_elimination", false),